In [None]:
import json
from bs4 import BeautifulSoup
import pandas as pd

def get_formatted_text(element):
    """Return the content of ``element`` as one stripped, newline-free string.

    Each direct child in ``element.contents`` contributes one piece:

    * a child named ``Format`` contributes only its text (the joined
      ``.strings``), so the wrapper itself disappears;
    * any other child contributes ``str(child)`` as-is (for a nested tag this
      is presumably its serialized form - confirm against the input data).

    Newline characters are deleted (not replaced by spaces), the pieces are
    concatenated without a separator and the result is stripped.
    ``None`` yields ``''``.
    """
    if element is None:
        return ''

    def _flatten(node):
        # Unwrap <Format>; keep everything else exactly as str() renders it
        raw = ''.join(node.strings) if node.name == 'Format' else str(node)
        return raw.replace('\n', '')

    return ''.join(_flatten(node) for node in element.contents).strip()

def process_group(group):
    """Flatten one ``<Group>`` element into a dict.

    The dict starts with the group's ``Level`` and ``Count`` attributes
    (``None`` when absent) and then gets one entry per direct child, keyed by
    the child's tag name and holding its text from ``get_formatted_text``.
    Direct children that are themselves ``<Group>`` elements are skipped.
    When the same tag name occurs more than once, the last occurrence wins.
    """
    leaf_children = (
        child
        for child in group.find_all(recursive=False)
        if child.name != 'Group'  # nested groups are deliberately ignored
    )

    group_details = {'Level': group.get('Level'), 'Count': group.get('Count')}
    group_details.update(
        (child.name, get_formatted_text(child)) for child in leaf_children
    )
    return group_details

def process_headword(headword):
    """Turn one ``<Headword>`` element into a flat record (dict).

    The record contains:

    * the ``ID``, ``SortAs`` and ``Level`` attributes (``None`` when absent);
    * one entry per direct non-``Group`` child, keyed by tag name, with the
      text returned by ``get_formatted_text``;
    * ``'Groups'``: a JSON string holding the list of dicts produced by
      ``process_group`` for the direct ``<Group>`` children, or ``'[]'`` when
      there are none.
    """
    headword_data = {attr: headword.get(attr) for attr in ('ID', 'SortAs', 'Level')}

    groups_data = []
    for child in headword.find_all(recursive=False):
        if child.name != 'Group':
            # Plain child: tag name -> cleaned text
            headword_data[child.name] = get_formatted_text(child)
            continue
        groups_data.append(process_group(child))

    # An empty list serialises to '[]' as well, so no special case is needed
    headword_data['Groups'] = json.dumps(groups_data, ensure_ascii=False)
    return headword_data

def parse_xml_to_dataframe(xml_file_path):
    """Read the XML file at ``xml_file_path`` and return one DataFrame row per ``<Headword>``.

    The file is read as UTF-8 text and parsed with BeautifulSoup's ``'xml'``
    parser; every ``Headword`` element found in the document is converted with
    ``process_headword`` and the resulting dicts become the rows.
    """
    with open(xml_file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'xml')

    records = []
    for headword in soup.find_all('Headword'):
        records.append(process_headword(headword))

    return pd.DataFrame(records)

# Absolute path of the XML dictionary export to parse (hard-coded; adjust it
# when the file lives somewhere else)
xml_file_path = '/content/Kroatisch-Nederlands.xml'
# One row per <Headword> element; the later cells derive their frames from df_2013
df_2013 = parse_xml_to_dataframe(xml_file_path)

In [None]:
df_2013

Unnamed: 0,ID,SortAs,Level,HW,GR,PR,KG,RZ,P2,LW,...,PL,TX,LS,KN,VI,B,MN,ST,AV,RE
0,0,a1,Incomplete,a1,n/m,,"slovo, glas",,a,d,...,,,,,,,,,,
1,1,a2,Incomplete,a2,conj,,,,maar,,...,,,,,,,,,,
2,2,a3,Incomplete,a3,m,,,,a,d,...,,,,,,,,,,
3,3,abeceda,Incomplete,abeced|a,f,,,,alfabet,h,...,,,,,,,,,,
4,4,abecedni,Incomplete,abecedni,adj,,,,alfabetisch,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36304,36123,žvakaći,Incomplete,žvakać|i,adj,,,,kauw-,h/d,...,,,,,,,,,,
36305,36124,žvakati,Incomplete,žvakati,"tr, i",,,,kauwen,,...,,,,,,,,,,
36306,36125,žvalav,Incomplete,žvalav,adj,,,,,,...,,met mondhoekkloofjes,,,,,,,,
36307,36126,žvale,Incomplete,žvale,f,,,,,,...,,,,,,,,,,


In [None]:
df_2013.Groups[9]

'[{"Level": "2", "Count": "1", "PR": "", "KG": "preuređenje", "P2": "herinrichting", "LW": "d", "RZ": ""}, {"Level": "2", "Count": "2", "PR": "", "KG": "obrad(b)a", "P2": "bewerking", "LW": "d", "RZ": ""}, {"Level": "2", "Count": "3", "PR": "", "KG": "prilagodba", "P2": "aanpassing", "LW": "d"}]'

In [None]:
df_2013.columns

Index(['ID', 'SortAs', 'Level', 'HW', 'GR', 'PR', 'KG', 'RZ', 'P2', 'LW', 'TR',
       'OC', 'FR', 'PF', 'Groups', 'KZ', 'LT', 'GS', 'UN', 'US', 'LJ', 'PL',
       'TX', 'LS', 'KN', 'VI', 'B', 'MN', 'ST', 'AV', 'RE'],
      dtype='object')

In [None]:
df_2013.KG.value_counts()

koga               9
što                7
u šahu             6
žena               5
o svjetlu          5
                  ..
glasanje pilića    1
u kartama          1
usnik              1
šištiti            1
površan            1
Name: KG, Length: 1162, dtype: int64

In [None]:
df_2013.Groups.isna().sum()

0

In [None]:
# Preview of the frame without the metadata columns. The result is deliberately
# NOT assigned back to df_2013: later cells still read df_2013.KG, drop 'ID'
# again and build KG/P2 pairs from frames derived from df_2013, so removing
# these columns here makes them fail on a fresh "Restart & Run All".
df_2013.drop(columns=['ID', 'GR', 'RZ', 'LW', 'SortAs', 'Level', 'TR', 'OC', 'KZ', 'LT', 'LJ', 'AV', 'RE', 'KG', 'PR'])

In [None]:
df_2013[df_2013['Groups']!='[]']

Unnamed: 0,HW,P2,FR,PF,Groups,GS,UN,US,PL,TX,LS,KN,VI,B,MN,ST
9,adaptacija,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""KG"": ...",,,,,,,,,,,
10,adaptirati,,,,"[{""Level"": ""1"", ""Count"": ""1"", ""PR"": """", ""GR"": ...",adaptiram,,,,,,,,,,
20,adresar,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""KG"": ...",adresara,,,,,,,,,,
36,afirmira|ti,,,,"[{""Level"": ""1"", ""Count"": ""1"", ""PR"": """", ""GR"": ...",afirmiram,,,,,,,,,,
45,agent,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""KG"": ...",,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36297,žutiti,,,,"[{""Level"": ""1"", ""Count"": ""1"", ""PR"": """", ""GR"": ...",,,,,,,,,,,
36298,žut|jeti,,,,"[{""Level"": ""1"", ""Count"": ""1"", ""PR"": """", ""GR"": ...",žutim,,,,,,,,,,
36299,žutokljunac,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""LS"": ...",,,,,,,,,,,
36307,žvale,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""KG"": ...",,,,,,,,,,,


In [None]:
df_2013

Unnamed: 0,HW,P2,FR,PF,Groups,GS,UN,US,PL,TX,LS,KN,VI,B,MN,ST
0,a1,a,od ~ do ž,van a tot z,[],,,,,,,,,,,
1,a2,maar,"on radi puno, ~ opet ništa ne zaradi","hij werkt veel, maar toch verdient hij niets",[],,,,,,,,,,,
2,a3,a,a-mol,"a-klein, a-mineur",[],,,,,,,,,,,
3,abeced|a,alfabet,po ~i,op alfabetische volgorde,[],,,,,,,,,,,
4,abecedni,alfabetisch,,,[],,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36304,žvakać|i,kauw-,~a guma,kauwgum,[],,,,,,,,,,,
36305,žvakati,kauwen,,,[],žvačem,,,,,,,,,,
36306,žvalav,,,,[],,,,,met mondhoekkloofjes,,,,,,
36307,žvale,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""KG"": ...",,,,,,,,,,,


In [None]:
# For every column: how many cells contain the '~' placeholder.
# na=False makes missing cells count as "does not contain".
tilda_counts = {
    column: df_2013[column].str.contains('~', na=False).sum()
    for column in df_2013.columns
}

# Display the counts
print(tilda_counts)

{'ID': 0, 'SortAs': 0, 'Level': 0, 'HW': 0, 'GR': 0, 'PR': 0, 'KG': 0, 'RZ': 0, 'P2': 0, 'LW': 0, 'TR': 0, 'OC': 0, 'FR': 9572, 'PF': 0, 'Groups': 3132, 'KZ': 0, 'LT': 0, 'GS': 0, 'UN': 0, 'US': 0, 'LJ': 0, 'PL': 0, 'TX': 0, 'LS': 0, 'KN': 0, 'VI': 0, 'B': 0, 'MN': 0, 'ST': 0, 'AV': 0, 'RE': 0}


In [None]:
import re

# Strip digits from a single 'HW' value
def remove_digits(s):
    """Return ``s`` with every run of decimal digits removed (``'a1'`` -> ``'a'``).

    Non-string values - e.g. the ``NaN``/``None`` a DataFrame holds for a row
    that has no ``HW`` entry - are returned unchanged instead of letting
    ``re.sub`` raise ``TypeError``.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'\d+', '', s)

# Strip the digits from the headwords ('a1', 'a2' -> 'a')
df_2013['HW'] = df_2013['HW'].map(remove_digits)

# Text substituted for '~': the headword up to (not including) the first '|'
replacement_text = df_2013['HW'].str.split('|').str[0]

# Expand the '~' placeholder in every column whose name starts with 'FR'
def replace_tilda_in_fr_columns(row, replacement_series):
    """Return a copy of ``row`` with '~' expanded in its ``FR*`` columns.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row, as passed by ``DataFrame.apply(..., axis=1)``.
        ``row.name`` (the row label) selects the replacement text.
    replacement_series : pandas.Series
        Replacement text, indexed by the same row labels.

    Returns
    -------
    pandas.Series
        A modified copy; the row that was passed in is not mutated. If the
        replacement for this row is missing or empty, ``row`` is returned
        untouched (there is nothing to substitute).

    A '~' is expanded when it is directly followed by a word character
    (``'po ~i'``) or when it stands alone between whitespace / string
    boundaries (``'od ~ do'``). Only string cells are processed.

    Compared with two chained ``re.sub`` calls that swallow the neighbouring
    whitespace, this version

    * adds no leading/trailing space when '~' opens or closes the phrase,
    * also expands adjacent standalone tildes (``'~ ~'``),
    * inserts the replacement literally, so a backslash in it is not read as a
      regex escape or group reference.

    NOTE(review): a '~' directly followed by punctuation (``'~,'``, ``'~!'``)
    is still left as it is - confirm whether that is intended.
    """
    replacement = replacement_series[row.name]
    if not isinstance(replacement, str) or not replacement:
        return row

    # Look-arounds match the '~' alone and leave surrounding whitespace intact
    placeholder = re.compile(r'~(?=\w)|(?<!\S)~(?!\S)')

    def _expand(_match):
        # A callable replacement bypasses re's backslash template processing
        return replacement

    row = row.copy()
    for column in row.index:
        if not column.startswith('FR'):
            continue
        value = row[column]
        if isinstance(value, str):  # skip NaN / non-text cells
            row[column] = placeholder.sub(_expand, value)

    return row

# Run the expansion row by row; only columns whose name starts with 'FR' are touched
df_2013 = df_2013.apply(replace_tilda_in_fr_columns, axis=1, args=(replacement_text,))

In [None]:
df_2013

Unnamed: 0,ID,SortAs,Level,HW,GR,PR,KG,RZ,P2,LW,...,PL,TX,LS,KN,VI,B,MN,ST,AV,RE
0,0,a1,Incomplete,a,n/m,,"slovo, glas",,a,d,...,,,,,,,,,,
1,1,a2,Incomplete,a,conj,,,,maar,,...,,,,,,,,,,
2,2,a3,Incomplete,a,m,,,,a,d,...,,,,,,,,,,
3,3,abeceda,Incomplete,abeced|a,f,,,,alfabet,h,...,,,,,,,,,,
4,4,abecedni,Incomplete,abecedni,adj,,,,alfabetisch,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36304,36123,žvakaći,Incomplete,žvakać|i,adj,,,,kauw-,h/d,...,,,,,,,,,,
36305,36124,žvakati,Incomplete,žvakati,"tr, i",,,,kauwen,,...,,,,,,,,,,
36306,36125,žvalav,Incomplete,žvalav,adj,,,,,,...,,met mondhoekkloofjes,,,,,,,,
36307,36126,žvale,Incomplete,žvale,f,,,,,,...,,,,,,,,,,


In [None]:
df_2013.columns

Index(['ID', 'SortAs', 'Level', 'HW', 'GR', 'PR', 'KG', 'RZ', 'P2', 'LW', 'TR',
       'OC', 'FR', 'PF', 'Groups', 'KZ', 'LT', 'GS', 'UN', 'US', 'LJ', 'PL',
       'TX', 'LS', 'KN', 'VI', 'B', 'MN', 'ST', 'AV', 'RE'],
      dtype='object')

In [None]:
df_2013.KG.value_counts()

koga               9
što                7
u šahu             6
žena               5
o svjetlu          5
                  ..
glasanje pilića    1
u kartama          1
usnik              1
šištiti            1
površan            1
Name: KG, Length: 1162, dtype: int64

In [None]:
df_2013.drop(columns=['ID', 'SortAs', 'Level', 'GR', 'RZ', 'LW', 'TR', 'OC', 'KZ', 'LT', 'PL', 'GS', 'LJ', 'LS', 'MN', 'ST', 'AV',])

Unnamed: 0,HW,PR,KG,P2,FR,PF,Groups,UN,US,TX,KN,VI,B,RE
0,a,,"slovo, glas",a,od a do ž,van a tot z,[],,,,,,,
1,a,,,maar,"on radi puno, a opet ništa ne zaradi","hij werkt veel, maar toch verdient hij niets",[],,,,,,,
2,a,,,a,a-mol,"a-klein, a-mineur",[],,,,,,,
3,abeced|a,,,alfabet,po abecedi,op alfabetische volgorde,[],,,,,,,
4,abecedni,,,alfabetisch,,,[],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36304,žvakać|i,,,kauw-,žvakaća guma,kauwgum,[],,,,,,,
36305,žvakati,,,kauwen,,,[],,,,,,,
36306,žvalav,,,,,,[],,,met mondhoekkloofjes,,,,
36307,žvale,,,,,,"[{""Level"": ""2"", ""Count"": ""1"", ""PR"": """", ""KG"": ...",,,,,,,


In [None]:
# True for headwords without groups: their 'Groups' cell is exactly the JSON text '[]'
has_no_groups = df_2013['Groups'] == '[]'

# .copy() gives each half its own data, so later cells can assign columns on
# them without pandas raising SettingWithCopyWarning.
df_with_empty_groups = df_2013[has_no_groups].copy()
df_without_empty_groups = df_2013[~has_no_groups].copy()

In [None]:
df_with_empty_groups

Unnamed: 0,ID,SortAs,Level,HW,GR,PR,KG,RZ,P2,LW,...,PL,TX,LS,KN,VI,B,MN,ST,AV,RE
0,0,a1,Incomplete,a,n/m,,"slovo, glas",,a,d,...,,,,,,,,,,
1,1,a2,Incomplete,a,conj,,,,maar,,...,,,,,,,,,,
2,2,a3,Incomplete,a,m,,,,a,d,...,,,,,,,,,,
3,3,abeceda,Incomplete,abeced|a,f,,,,alfabet,h,...,,,,,,,,,,
4,4,abecedni,Incomplete,abecedni,adj,,,,alfabetisch,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36302,36121,žvačem,Incomplete,žvačem,pres,,,,,,...,,,,,žvakati,,,,,
36303,36122,žvaka,Incomplete,žvaka,f,,,,kauwgum,h/d,...,,,coll,,,,,,,
36304,36123,žvakaći,Incomplete,žvakać|i,adj,,,,kauw-,h/d,...,,,,,,,,,,
36305,36124,žvakati,Incomplete,žvakati,"tr, i",,,,kauwen,,...,,,,,,,,,,


In [None]:
# Decode the JSON text in 'Groups' into lists of dicts. .assign() builds a new
# frame instead of writing into df_without_empty_groups, which avoids the
# SettingWithCopyWarning and keeps the cell re-runnable (the input still holds
# JSON text on a second run).
groups_parsed = df_without_empty_groups.assign(
    Groups=df_without_empty_groups['Groups'].map(json.loads)
)

# One row per group; the headword's row label is kept as 'original_index'
df_exploded = (
    groups_parsed
    .explode('Groups')
    .reset_index()
    .rename(columns={'index': 'original_index'})
)

# Spread each group dict over columns (one column per key)
df_normalized = pd.json_normalize(df_exploded['Groups'].tolist())

# Headword columns on the left, group columns on the right. Both frames carry a
# fresh 0..n-1 index at this point, so they line up row by row.
df_final = pd.concat([df_exploded[['original_index', 'HW']], df_normalized], axis=1)

# Make column names unique. The first occurrence keeps its name (later cells
# select df_final['HW']); repeats get '_2', '_3', ... appended. The previous
# loop indexed `columns.values` with a column label and would have raised
# IndexError as soon as a duplicate actually occurred.
seen = {}
unique_names = []
for name in df_final.columns:
    seen[name] = seen.get(name, 0) + 1
    unique_names.append(name if seen[name] == 1 else f"{name}_{seen[name]}")
df_final.columns = unique_names

df_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_without_empty_groups['Groups'] = df_without_empty_groups['Groups'].apply(lambda x: json.loads(x))


Unnamed: 0,original_index,HW,Level,Count,PR,KG,P2,LW,RZ,GR,...,TR,LS,VI,LJ,MN,ST,GS,AV,RE,PL
0,9,adaptacija,2,1,,preuređenje,herinrichting,d,,,...,,,,,,,,,,
1,9,adaptacija,2,2,,obrad(b)a,bewerking,d,,,...,,,,,,,,,,
2,9,adaptacija,2,3,,prilagodba,aanpassing,d,,,...,,,,,,,,,,
3,10,adaptirati,1,1,,,,,,"tr, i/p",...,,,,,,,,,,
4,10,adaptirati,1,2,,,zich aanpassen,,,"refl, i/p",...,,,,,,,,,,


In [None]:
df_final.iloc[3]

original_index            10
HW                adaptirati
Level                      1
Count                      1
PR                          
KG                       NaN
P2                       NaN
LW                       NaN
RZ                          
GR                   tr, i/p
SE                       NaN
OC                       NaN
FR                       NaN
PF                       NaN
KZ                       NaN
LT                       NaN
US                       NaN
TX                       NaN
B                        NaN
UN                       NaN
KN                       NaN
TR                       NaN
LS                       NaN
VI                       NaN
LJ                       NaN
MN                       NaN
ST                       NaN
GS                       NaN
AV                       NaN
RE                       NaN
PL                       NaN
Name: 3, dtype: object

In [None]:
df_final

Unnamed: 0,original_index,HW,Level,Count,PR,KG,P2,LW,RZ,GR,...,TR,LS,VI,LJ,MN,ST,GS,AV,RE,PL
0,9,adaptacija,2,1,,preuređenje,herinrichting,d,,,...,,,,,,,,,,
1,9,adaptacija,2,2,,obrad(b)a,bewerking,d,,,...,,,,,,,,,,
2,9,adaptacija,2,3,,prilagodba,aanpassing,d,,,...,,,,,,,,,,
3,10,adaptirati,1,1,,,,,,"tr, i/p",...,,,,,,,,,,
4,10,adaptirati,1,2,,,zich aanpassen,,,"refl, i/p",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,36307,žvale,2,1,,za konja,bit,h,,,...,,,,,,,,,,
12538,36307,žvale,2,2,,,mondhoekkloofjes,d,,,...,,,,,,,,,,
12539,36307,žvale,2,3,,usta,mond,d,,,...,,pej,,,,,,,,
12540,36308,žvaliti,1,1,,,tongen,,,"refl, i",...,,sl,,,,,,,,


In [None]:
df_final.columns

Index(['original_index', 'HW', 'Level', 'Count', 'PR', 'KG', 'P2', 'LW', 'RZ',
       'GR', 'SE', 'OC', 'FR', 'PF', 'KZ', 'LT', 'US', 'TX', 'B', 'UN', 'KN',
       'TR', 'LS', 'VI', 'LJ', 'MN', 'ST', 'GS', 'AV', 'RE', 'PL'],
      dtype='object')

In [None]:
df_final.drop(columns=['Level', 'GR', 'RZ', 'LW', 'TR', 'OC', 'KZ', 'LT', 'PL', 'GS', 'LJ', 'LS', 'MN', 'ST', 'AV',])

Unnamed: 0,original_index,HW,Count,PR,KG,P2,SE,FR,PF,US,TX,B,UN,KN,VI,RE
0,9,adaptacija,1,,preuređenje,herinrichting,,,,,,,,,,
1,9,adaptacija,2,,obrad(b)a,bewerking,,,,,,,,,,
2,9,adaptacija,3,,prilagodba,aanpassing,,,,,,,,,,
3,10,adaptirati,1,,,,,,,,,,,,,
4,10,adaptirati,2,,,zich aanpassen,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,36307,žvale,1,,za konja,bit,,,,,,,,,,
12538,36307,žvale,2,,,mondhoekkloofjes,,,,,,,,,,
12539,36307,žvale,3,,usta,mond,,,,,,,,,,
12540,36308,žvaliti,1,,,tongen,,,,,,,,,,


In [None]:
# Text substituted for '~' in the group-level rows: the headword up to the first '|'
replacement_text = df_final['HW'].str.split('|').str[0]

# replace_tilda_in_fr_columns is already defined in the cell that cleans the
# headword-level frame; it is reused here rather than pasted a second time, so
# there is a single implementation to maintain and nothing shadows it.
df_final = df_final.apply(replace_tilda_in_fr_columns, axis=1, replacement_series=replacement_text)

In [None]:
df_final_fr_pf = df_final[['FR', 'PF']]

In [None]:
# Keep only rows where both FR and PF are present. Rebinding to the result of
# dropna() avoids the in-place call on a column selection of df_final, which
# is what raised SettingWithCopyWarning.
df_final_fr_pf = df_final_fr_pf.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_fr_pf.dropna(inplace=True)


In [None]:
df_final_fr_pf

Unnamed: 0,FR,PF
8,afirmirao se kao slikar,hij maakte naam als schilder
13,stupiti u akciju,in actie komen
15,spolni akt,geslachtsdaad
55,aranžer izloga,etaleur
64,luksuzni artikl,luxeartikel
...,...,...
12517,operacija žuči,galblaasoperatie
12518,žučna rasprava,een felle discussie
12522,modni žurnal,modeblad
12523,filmski žurnal,filmjournaal


In [None]:
df_with_empty_groups_fr_pf = df_with_empty_groups[['FR', 'PF']]

In [None]:
df_with_empty_groups_fr_pf

Unnamed: 0,FR,PF
0,od a do ž,van a tot z
1,"on radi puno, a opet ništa ne zaradi","hij werkt veel, maar toch verdient hij niets"
2,a-mol,"a-klein, a-mineur"
3,po abecedi,op alfabetische volgorde
4,,
...,...,...
36302,,
36303,,
36304,žvakaća guma,kauwgum
36305,,


In [None]:
# Keep only rows where both FR and PF are present. Rebinding to the result of
# dropna() avoids the in-place call on a column selection, which is what
# raised SettingWithCopyWarning.
df_with_empty_groups_fr_pf = df_with_empty_groups_fr_pf.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_empty_groups_fr_pf.dropna(inplace=True)


In [None]:
df_final

Unnamed: 0,original_index,HW,Level,Count,PR,KG,P2,LW,RZ,GR,...,TR,LS,VI,LJ,MN,ST,GS,AV,RE,PL
0,9,adaptacija,2,1,,preuređenje,herinrichting,d,,,...,,,,,,,,,,
1,9,adaptacija,2,2,,obrad(b)a,bewerking,d,,,...,,,,,,,,,,
2,9,adaptacija,2,3,,prilagodba,aanpassing,d,,,...,,,,,,,,,,
3,10,adaptirati,1,1,,,,,,"tr, i/p",...,,,,,,,,,,
4,10,adaptirati,1,2,,,zich aanpassen,,,"refl, i/p",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,36307,žvale,2,1,,za konja,bit,h,,,...,,,,,,,,,,
12538,36307,žvale,2,2,,,mondhoekkloofjes,d,,,...,,,,,,,,,,
12539,36307,žvale,2,3,,usta,mond,d,,,...,,pej,,,,,,,,
12540,36308,žvaliti,1,1,,,tongen,,,"refl, i",...,,sl,,,,,,,,


In [None]:
# Combine the dataframes
combined_df_fr_pf = pd.concat([df_with_empty_groups_fr_pf, df_final_fr_pf], ignore_index=True)

combined_df_fr_pf

Unnamed: 0,FR,PF
0,od a do ž,van a tot z
1,"on radi puno, a opet ništa ne zaradi","hij werkt veel, maar toch verdient hij niets"
2,a-mol,"a-klein, a-mineur"
3,po abecedi,op alfabetische volgorde
4,ova primjedba ide na tvoju adresu,dit is een opmerking aan jouw adres
...,...,...
14617,operacija žuči,galblaasoperatie
14618,žučna rasprava,een felle discussie
14619,modni žurnal,modeblad
14620,filmski žurnal,filmjournaal


In [None]:
df_with_empty_groups_hw_p2 = df_with_empty_groups[['HW', 'P2']]

In [None]:
df_with_empty_groups_hw_p2

Unnamed: 0,HW,P2
0,a,a
1,a,maar
2,a,a
3,abeced|a,alfabet
4,abecedni,alfabetisch
...,...,...
36302,žvačem,
36303,žvaka,kauwgum
36304,žvakać|i,kauw-
36305,žvakati,kauwen


In [None]:
df_with_empty_groups_us_p2 = df_with_empty_groups[['US', 'P2']]

In [None]:
# Keep only rows where both US and P2 are present. Rebinding to the result of
# dropna() avoids the in-place call on a column selection, which is what
# raised SettingWithCopyWarning.
df_with_empty_groups_us_p2 = df_with_empty_groups_us_p2.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_empty_groups_us_p2.dropna(inplace=True)


In [None]:
df_with_empty_groups_us_p2

Unnamed: 0,US,P2
24,odvjetnik,advocaat
26,odvjetnički,advocaten-
35,sklonost,affiniteit
73,naglasak,accent
74,naglasiti,benadrukken
...,...,...
36196,živahnost,levensechtheid
36220,žmigati,knipperen
36240,žrtvovati,ontzegging
36260,gorak,bitter


In [None]:
# US/P2 pairs from the group-level rows, without rows missing either value.
# Chaining .dropna() returns a new frame, so no in-place change is attempted on
# a column selection of df_final (the source of the SettingWithCopyWarning).
df_final_us_p2 = df_final[['US', 'P2']].dropna()
df_final_us_p2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_us_p2.dropna(inplace=True)


Unnamed: 0,US,P2
11,poduzetan,actief
12,okretan,vlot
52,apsolvent,
64,proizvod,product
65,članak,artikel
...,...,...
12422,jadikovka 1,geklaag
12425,žalovati,rouwperiode
12461,oženiti I 1,tot vrouw nemen
12482,živahan,levendig


In [None]:
# HW/P2 pairs from the group-level rows, without rows missing either value.
# Chaining .dropna() returns a new frame, so no in-place change is attempted on
# a column selection of df_final (the source of the SettingWithCopyWarning).
df_final_hw_p2 = df_final[['HW', 'P2']].dropna()
df_final_hw_p2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_hw_p2.dropna(inplace=True)


Unnamed: 0,HW,P2
0,adaptacija,herinrichting
1,adaptacija,bewerking
2,adaptacija,aanpassing
4,adaptirati,zich aanpassen
5,adresar,adresboekje
...,...,...
12537,žvale,bit
12538,žvale,mondhoekkloofjes
12539,žvale,mond
12540,žvaliti,tongen


In [None]:
df_final[df_final['KG'].str.contains('za ', na=False)]

Unnamed: 0,original_index,HW,Level,Count,PR,KG,P2,LW,RZ,GR,...,TR,LS,VI,LJ,MN,ST,GS,AV,RE,PL
35,176,ambulanta,2,2,,bolnička soba za prijam,opnamekamer,d,,,...,,,,,,,,,,
48,274,aparatić,2,1,,aparatić za zube,beugel,d,,,...,,,,,,,,,,
136,706,bazen,2,1,,bazen za plivanje,zwembad,h,,,...,,,,,,,,,,
267,1085,blok,2,2,,blok za crtanje,tekenblok,h,,,...,,,,,,,,,,
268,1085,blok,2,3,,"blok za kupone, karte itd.",boekje,h,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11656,33922,vrtić,2,1,,vrtić za djecu,kleuterschool,d,,,...,krɛʃ,,,fr,,,,,,
11666,33964,vu|ći,1,1,,vu|ći za sobom,slepen,,,"tr, i",...,,fig,,,,,,,,
11796,34296,zagrijan,2,2,,zagrijan za što,enthousiast,,,,...,ɛntu'ʃɑst,coll,,,,,,,,
11910,34571,zalistak,2,3,,zalistak za knjigu,schutblad,h,,,...,,,,,,,,,,


In [None]:
# Rows whose KG text begins with 'za ' get the headword put in front of it
# ('za konja' -> 'žvale za konja'); na=False leaves rows without KG unselected.
mask = df_final['KG'].str.startswith('za ', na=False)
df_final.loc[mask, 'KG'] = df_final.loc[mask, 'HW'] + ' ' + df_final.loc[mask, 'KG']

In [None]:
df_final

Unnamed: 0,original_index,HW,Level,Count,PR,KG,P2,LW,RZ,GR,...,TR,LS,VI,LJ,MN,ST,GS,AV,RE,PL
0,9,adaptacija,2,1,,preuređenje,herinrichting,d,,,...,,,,,,,,,,
1,9,adaptacija,2,2,,obrad(b)a,bewerking,d,,,...,,,,,,,,,,
2,9,adaptacija,2,3,,prilagodba,aanpassing,d,,,...,,,,,,,,,,
3,10,adaptirati,1,1,,,,,,"tr, i/p",...,,,,,,,,,,
4,10,adaptirati,1,2,,,zich aanpassen,,,"refl, i/p",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,36307,žvale,2,1,,žvale za konja,bit,h,,,...,,,,,,,,,,
12538,36307,žvale,2,2,,,mondhoekkloofjes,d,,,...,,,,,,,,,,
12539,36307,žvale,2,3,,usta,mond,d,,,...,,pej,,,,,,,,
12540,36308,žvaliti,1,1,,,tongen,,,"refl, i",...,,sl,,,,,,,,


In [None]:
# KG/P2 pairs from the group-level rows, without rows missing either value.
# Chaining .dropna() returns a new frame, so no in-place change is attempted on
# a column selection of df_final (the source of the SettingWithCopyWarning).
df_final_kg_p2 = df_final[['KG', 'P2']].dropna()
df_final_kg_p2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_kg_p2.dropna(inplace=True)


Unnamed: 0,KG,P2
0,preuređenje,herinrichting
1,obrad(b)a,bewerking
2,prilagodba,aanpassing
5,notes,adresboekje
6,popis,adressenlijst
...,...,...
12530,žuti tisak,sensatiepers
12533,postajati žut,geel worden
12536,ptić,jonge vogel
12537,žvale za konja,bit


In [None]:
df_final.KG.value_counts()

osoba            61
koga             40
što              27
brojka           19
tramvaj i sl.    17
                 ..
loše slikati      1
žamoriti          1
žuboriti          1
mesno jelo        1
usta              1
Name: KG, Length: 5022, dtype: int64

In [None]:
df_final.PR.value_counts()

     12469
/        9
Name: PR, dtype: int64

In [None]:
combined_df_fr_pf

Unnamed: 0,FR,PF
0,od a do ž,van a tot z
1,"on radi puno, a opet ništa ne zaradi","hij werkt veel, maar toch verdient hij niets"
2,a-mol,"a-klein, a-mineur"
3,po abecedi,op alfabetische volgorde
4,ova primjedba ide na tvoju adresu,dit is een opmerking aan jouw adres
...,...,...
14617,operacija žuči,galblaasoperatie
14618,žučna rasprava,een felle discussie
14619,modni žurnal,modeblad
14620,filmski žurnal,filmjournaal


In [None]:
combined_df_fr_pf = combined_df_fr_pf.rename(columns={'FR': 'hr', 'PF': 'nl'})

In [None]:
df_final_kg_p2
df_final_us_p2
df_final_hw_p2

Unnamed: 0,HW,P2
0,adaptacija,herinrichting
1,adaptacija,bewerking
2,adaptacija,aanpassing
4,adaptirati,zich aanpassen
5,adresar,adresboekje
...,...,...
12537,žvale,bit
12538,žvale,mondhoekkloofjes
12539,žvale,mond
12540,žvaliti,tongen


In [None]:
df_with_empty_groups_hw_p2

Unnamed: 0,HW,P2
0,a,a
1,a,maar
2,a,a
3,abeced|a,alfabet
4,abecedni,alfabetisch
...,...,...
36302,žvačem,
36303,žvaka,kauwgum
36304,žvakać|i,kauw-
36305,žvakati,kauwen


In [None]:
# HW/P2 pairs for the headwords without groups, without rows missing either
# value. This cell used to create df_empty_hw_p2 but then called dropna on -
# and displayed - df_final_hw_p2, which looks like a copy-paste slip: that
# frame is already NaN-free after the cell that builds it.
df_empty_hw_p2 = df_with_empty_groups_hw_p2[['HW', 'P2']].dropna()
df_empty_hw_p2

In [None]:
df_final_kg_p2

Unnamed: 0,hr,nl
0,preuređenje,herinrichting
1,obrad(b)a,bewerking
2,prilagodba,aanpassing
5,notes,adresboekje
6,popis,adressenlijst
...,...,...
12530,žuti tisak,sensatiepers
12533,postajati žut,geel worden
12536,ptić,jonge vogel
12537,žvale za konja,bit


In [None]:
# KG/P2 pairs for the headwords without groups, without rows missing either
# value. Chaining .dropna() returns a new frame, so no in-place change is
# attempted on a column selection (the source of the SettingWithCopyWarning).
df_with_empty_groups_kg_p2 = df_with_empty_groups[['KG', 'P2']].dropna()
df_with_empty_groups_kg_p2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_with_empty_groups_kg_p2.dropna(inplace=True)


Unnamed: 0,KG,P2
0,"slovo, glas",a
111,prostakuša,viswijf
174,ugođaj,sfeer
203,po analogiji,naar analogie (van/met)
220,anestetičar,anesthesist
...,...,...
36239,kamen,offersteen
36240,odricanje,ontzegging
36261,mjehur,blaar
36270,zgrada,provinciehuis


In [None]:
# Two-column pair tables that must end up with the columns ('hr', 'nl')
dataframes = [df_final_kg_p2, df_final_us_p2, df_final_hw_p2, df_with_empty_groups_hw_p2, df_with_empty_groups_us_p2, df_with_empty_groups_kg_p2]

# Column renamed explicitly: 'P2' becomes 'nl'
rename_dict = {'P2': 'nl'}


def _to_hr_nl(frame):
    """Return a renamed copy of ``frame``: 'P2' -> 'nl', every other column -> 'hr'."""
    renamed = frame.rename(columns=rename_dict)
    renamed.columns = ['hr' if col not in rename_dict.values() else col for col in renamed.columns]
    return renamed


# Rename on copies and rebind each name to its copy. Renaming in place mutated
# frames that are column selections of other frames, which raised
# SettingWithCopyWarning. Frames that already use 'hr'/'nl' pass through
# unchanged, so the cell can be re-run safely.
dataframes = [_to_hr_nl(frame) for frame in dataframes]
(
    df_final_kg_p2,
    df_final_us_p2,
    df_final_hw_p2,
    df_with_empty_groups_hw_p2,
    df_with_empty_groups_us_p2,
    df_with_empty_groups_kg_p2,
) = dataframes

# Now, each DataFrame in the list has its columns renamed accordingly

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=rename_dict, inplace=True)


In [None]:
# All seven (hr, nl) pair tables, stacked in this order
pair_tables = [
    df_final_kg_p2,
    df_final_us_p2,
    df_final_hw_p2,
    combined_df_fr_pf,
    df_with_empty_groups_hw_p2,
    df_with_empty_groups_us_p2,
    df_with_empty_groups_kg_p2,
]
combined_df = pd.concat(pair_tables, ignore_index=True)

# Renumber the rows 0..n-1 (ignore_index above already leaves them that way)
combined_df = combined_df.reset_index(drop=True)

print(combined_df)

                hr             nl
0      preuređenje  herinrichting
1        obrad(b)a      bewerking
2       prilagodba     aanpassing
3            notes    adresboekje
4            popis  adressenlijst
...            ...            ...
65012        kamen     offersteen
65013    odricanje     ontzegging
65014       mjehur          blaar
65015       zgrada  provinciehuis
65016      površan        haastig

[65017 rows x 2 columns]


In [None]:
import pandas as pd

# Rebuild the combined table from the same seven pair tables as the previous
# cell. The list here used to leave out df_with_empty_groups_kg_p2, so the two
# cells disagreed and those pairs never reached the export.
# NOTE(review): if that omission was deliberate, remove the last entry again.
dataframes = [
    df_final_kg_p2,
    df_final_us_p2,
    df_final_hw_p2,
    combined_df_fr_pf,
    df_with_empty_groups_hw_p2,
    df_with_empty_groups_us_p2,
    df_with_empty_groups_kg_p2,
]
combined_df = pd.concat(dataframes, ignore_index=True)

# A pair is unusable when either side is an empty string. Both sides are
# marked as missing (not set to '') so that the dropna() in the next cell
# removes the row; blanked-out '' pairs would pass dropna() and end up in the
# exported file as an empty row.
mask = (combined_df['hr'].str.len() < 1) | (combined_df['nl'].str.len() < 1)
combined_df.loc[mask, ['hr', 'nl']] = pd.NA

print(combined_df)

                hr              nl
0      preuređenje   herinrichting
1        obrad(b)a       bewerking
2       prilagodba      aanpassing
3            notes     adresboekje
4            popis   adressenlijst
...            ...             ...
63737    živahnost  levensechtheid
63738      žmigati       knipperen
63739    žrtvovati      ontzegging
63740        gorak          bitter
63741        tulum         feestje

[63742 rows x 2 columns]


In [None]:
combined_df.dropna(inplace=True)

In [None]:
combined_df

Unnamed: 0,hr,nl
0,preuređenje,herinrichting
1,obrad(b)a,bewerking
2,prilagodba,aanpassing
3,notes,adresboekje
4,popis,adressenlijst
...,...,...
63737,živahnost,levensechtheid
63738,žmigati,knipperen
63739,žrtvovati,ontzegging
63740,gorak,bitter


In [None]:
# Character class matching '(', ')' and '|'
pattern = r'[\(\)\|]'

# Delete those characters from every object-dtype (text) column
for name in combined_df.select_dtypes(include=['object']).columns:
    combined_df[name] = combined_df[name].str.replace(pattern, '', regex=True)

print(combined_df)

                hr              nl
0      preuređenje   herinrichting
1          obradba       bewerking
2       prilagodba      aanpassing
3            notes     adresboekje
4            popis   adressenlijst
...            ...             ...
63737    živahnost  levensechtheid
63738      žmigati       knipperen
63739    žrtvovati      ontzegging
63740        gorak          bitter
63741        tulum         feestje

[57818 rows x 2 columns]


In [None]:
combined_df.reset_index(drop=True, inplace=True)

In [None]:
combined_df

Unnamed: 0,hr,nl
0,preuređenje,herinrichting
1,obradba,bewerking
2,prilagodba,aanpassing
3,notes,adresboekje
4,popis,adressenlijst
...,...,...
57813,živahnost,levensechtheid
57814,žmigati,knipperen
57815,žrtvovati,ontzegging
57816,gorak,bitter


In [None]:
# The statement that used to be here, `df_final_kg_p2 = df_final[[]]`, rebound
# df_final_kg_p2 to a selection with no columns - it looks like an unfinished
# edit. It is dropped so the KG/P2 pair table stays intact; re-running the
# concat cells afterwards would otherwise pick up the column-less frame.
df_final_kg_p2.head()

In [None]:
# Mount Google Drive at /content/drive; the final cell writes its CSV below
# /content/drive/MyDrive, so this has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
combined_df

Unnamed: 0,hr,nl
0,preuređenje,herinrichting
1,obradba,bewerking
2,prilagodba,aanpassing
3,notes,adresboekje
4,popis,adressenlijst
...,...,...
57813,živahnost,levensechtheid
57814,žmigati,knipperen
57815,žrtvovati,ontzegging
57816,gorak,bitter


In [None]:
combined_df.drop_duplicates(inplace=True)

In [None]:
combined_df

Unnamed: 0,hr,nl
0,preuređenje,herinrichting
1,obradba,bewerking
2,prilagodba,aanpassing
3,notes,adresboekje
4,popis,adressenlijst
...,...,...
56081,slikovit,beeldend
56082,živahnost,levensechtheid
56083,žmigati,knipperen
56084,žrtvovati,ontzegging


In [None]:
combined_df.reset_index(inplace=True, drop=True)

In [None]:
combined_df

Unnamed: 0,hr,nl
0,preuređenje,herinrichting
1,obradba,bewerking
2,prilagodba,aanpassing
3,notes,adresboekje
4,popis,adressenlijst
...,...,...
56081,slikovit,beeldend
56082,živahnost,levensechtheid
56083,žmigati,knipperen
56084,žrtvovati,ontzegging


In [None]:
combined_df.dropna(inplace=True)

In [None]:
# Export the hr/nl pairs. index=False keeps the row counter out of the file:
# it is just 0..n-1 after the reset and would otherwise be written as an
# unnamed first column.
# NOTE(review): any reader that loads this file with index_col=0 must be adjusted.
combined_df.to_csv('/content/drive/MyDrive/combined_df.csv', index=False)