In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse('folkets_sv_en_public.xdxf.xml')
root = tree.getroot()

# Inspect the root to understand structure (optional)
# print(ET.tostring(root, encoding='unicode'))

# Extract data - adjust this depending on actual XML structure
entries = []
for ar in root.findall('.//ar'):  # 'ar' is the article tag in XDXF
    term = ar.find('k').text if ar.find('k') is not None else None
    definition = ''.join(ar.itertext()).strip()
    entries.append({'term': term, 'definition': definition})

# Create DataFrame
df = pd.DataFrame(entries)

# Show the first few rows
df.head()


Unnamed: 0,term,definition
0,à,à\n \n pp\n at\n a\n \n...
1,à,à\n \n pp\n to\n a\n \n...
2,a,a\n \n nn\n a\n a:\n \n...
3,a,a\n \n nn\n A\n a:\n \n...
4,a conto,a conto\n \n ab\n on account\n ...


In [2]:
df.tail()

Unnamed: 0,term,definition
39549,smickrare,smickrare\n \n flatterer
39550,tätna,tätna\n \n become dense
39551,utvakad,utvakad\n \n tired
39552,Tala illa om,Tala illa om\n \n Vilified
39553,hjorthornssalt,"hjorthornssalt\n \n hartshorn, ammoniu..."


In [3]:
# Split the 'definition' column by '\n' and expand into two new columns
df[['swedish', 'english']] = df['definition'].str.split(r'\n\s+', n=1, expand=True)

# Drop the old 'definition' column if you want
df = df.drop(columns=['definition'])

# Reorder columns if needed
df = df[['term', 'swedish', 'english']]

# View the cleaned DataFrame
df.head()


Unnamed: 0,term,swedish,english
0,à,à,pp\n at\n a\n \n två koppa...
1,à,à,pp\n to\n a\n \n 2 à 3 met...
2,a,a,nn\n a\n a:\n \n har man s...
3,a,a,nn\n A\n a:\n \n sjätte to...
4,a conto,a conto,ab\n on account\n ak'ån:to\n \n...


In [4]:
'''
# Clean the 'english' column
def clean_english(text):
    if not isinstance(text, str):
        return text
    lines = text.split('\n')
    # Skip first 1-2 lines, keep the rest
    cleaned = ' '.join(lines[2:]).strip()
    return cleaned

df['english_clean'] = df['english'].apply(clean_english)

# View cleaned version
df[['term', 'swedish', 'english_clean']].head()
'''

"\n# Clean the 'english' column\ndef clean_english(text):\n    if not isinstance(text, str):\n        return text\n    lines = text.split('\n')\n    # Skip first 1-2 lines, keep the rest\n    cleaned = ' '.join(lines[2:]).strip()\n    return cleaned\n\ndf['english_clean'] = df['english'].apply(clean_english)\n\n# View cleaned version\ndf[['term', 'swedish', 'english_clean']].head()\n"

In [5]:

import re

def better_clean(text):
    if not isinstance(text, str):
        return text
    lines = text.split('\n')
    # Remove lines that look like part of speech (like "pp", "nn", etc.)
    lines = [line for line in lines if not re.match(r'^[a-z]{1,2}$', line.strip())]
    # Remove lines that are pure IPA pronunciation (optional, if needed)
    lines = [line for line in lines if not re.match(r"^[a-zA-Zˈˌ:\.\s]+$", line.strip())]
    # Now, find the first line that seems like English (contains English words)
    for line in lines:
        if re.search(r'[a-zA-Z]', line) and not re.search(r'[åäöÅÄÖ]', line):
            return line.strip()
    return None

df['english_clean'] = df['english'].apply(better_clean)

# View results
df[['term', 'swedish', 'english_clean']].head()


Unnamed: 0,term,swedish,english_clean
0,à,à,
1,à,à,2 à 3 meter2 to 3 metres
2,a,a,"a och o (""det viktigaste"")alpha and omega"
3,a,a,
4,a conto,a conto,


In [6]:
df.tail()

Unnamed: 0,term,swedish,english,english_clean
39549,smickrare,smickrare,flatterer,
39550,tätna,tätna,become dense,
39551,utvakad,utvakad,tired,
39552,Tala illa om,Tala illa om,Vilified,
39553,hjorthornssalt,hjorthornssalt,"hartshorn, ammonium carbonate","hartshorn, ammonium carbonate"


In [7]:
df.head()

Unnamed: 0,term,swedish,english,english_clean
0,à,à,pp\n at\n a\n \n två koppa...,
1,à,à,pp\n to\n a\n \n 2 à 3 met...,2 à 3 meter2 to 3 metres
2,a,a,nn\n a\n a:\n \n har man s...,"a och o (""det viktigaste"")alpha and omega"
3,a,a,nn\n A\n a:\n \n sjätte to...,
4,a conto,a conto,ab\n on account\n ak'ån:to\n \n...,


In [8]:
import pandas as pd
import numpy as np
import re

def clean_dataframe(df):
    # Reset index
    df = df.reset_index(drop=True)
    
    # Custom cleaning function
    def clean_text(text):
        if pd.isna(text):
            return text
        # Keep only content before first \n
        cleaned = str(text).split('\n')[0]
        # Remove word tags at start (pp, nn, ab, etc.)
        cleaned = re.sub(r'^\w+\s', '', cleaned)
        return cleaned.strip()
    
    # Apply cleaning
    df['english'] = df['english'].apply(clean_text)
    
    # Preserve good english_clean values
    df['english'] = np.where(
        df['english_clean'].notna() & (df['english_clean'] != df['english']),
        df['english_clean'],
        df['english']
    )
    df['english_clean'] = df['english']
    
    # Standardize
    df['term'] = df['term'].str.lower()
    df['swedish'] = df['swedish'].str.lower()
    
    return df.drop_duplicates(subset=['term'])

# Apply cleaning
df = clean_dataframe(df)

# Ensure full content displays
pd.set_option('display.max_colwidth', None)

In [9]:
df.head()

Unnamed: 0,term,swedish,english,english_clean
0,à,à,pp,pp
2,a,a,"a och o (""det viktigaste"")alpha and omega","a och o (""det viktigaste"")alpha and omega"
4,a conto,a conto,ab,ab
5,à jour,à jour,up-to-date,up-to-date
6,à la,à la,à la,à la


In [10]:
df.tail()

Unnamed: 0,term,swedish,english,english_clean
39549,smickrare,smickrare,flatterer,flatterer
39550,tätna,tätna,dense,dense
39551,utvakad,utvakad,tired,tired
39552,tala illa om,tala illa om,Vilified,Vilified
39553,hjorthornssalt,hjorthornssalt,"hartshorn, ammonium carbonate","hartshorn, ammonium carbonate"


In [12]:
df.loc[10]

term                    ab svenska spel
swedish                 ab svenska spel
english          ²'a:be: sven:ska spe:l
english_clean    ²'a:be: sven:ska spe:l
Name: 10, dtype: object

In [15]:
df.loc[13]

term                                  abborre
swedish                               abborre
english          abborr|pinnevery small perch
english_clean    abborr|pinnevery small perch
Name: 13, dtype: object