In [5]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the XML file
tree = ET.parse('folkets_sv_en_public.xdxf.xml')
root = tree.getroot()

# Inspect the root to understand structure (optional)
# print(ET.tostring(root, encoding='unicode'))

# Extract data - adjust this depending on actual XML structure
entries = []
for ar in root.findall('.//ar'):  # 'ar' is the article tag in XDXF
    term = ar.find('k').text if ar.find('k') is not None else None
    definition = ''.join(ar.itertext()).strip()
    entries.append({'term': term, 'definition': definition})

# Create DataFrame
df = pd.DataFrame(entries)

# Show the first few rows
df.head()


KeyboardInterrupt: 

In [None]:
df.tail()

In [None]:
# Split the 'definition' column by '\n' and expand into two new columns
df[['swedish', 'english']] = df['definition'].str.split(r'\n\s+', n=1, expand=True)

# Drop the old 'definition' column if you want
df = df.drop(columns=['definition'])

# Reorder columns if needed
df = df[['term', 'swedish', 'english']]

# View the cleaned DataFrame
df.head()


In [None]:
'''
# Clean the 'english' column
def clean_english(text):
    if not isinstance(text, str):
        return text
    lines = text.split('\n')
    # Skip first 1-2 lines, keep the rest
    cleaned = ' '.join(lines[2:]).strip()
    return cleaned

df['english_clean'] = df['english'].apply(clean_english)

# View cleaned version
df[['term', 'swedish', 'english_clean']].head()
'''

In [None]:

import re

def better_clean(text):
    if not isinstance(text, str):
        return text
    lines = text.split('\n')
    # Remove lines that look like part of speech (like "pp", "nn", etc.)
    lines = [line for line in lines if not re.match(r'^[a-z]{1,2}$', line.strip())]
    # Remove lines that are pure IPA pronunciation (optional, if needed)
    lines = [line for line in lines if not re.match(r"^[a-zA-Zˈˌ:\.\s]+$", line.strip())]
    # Now, find the first line that seems like English (contains English words)
    for line in lines:
        if re.search(r'[a-zA-Z]', line) and not re.search(r'[åäöÅÄÖ]', line):
            return line.strip()
    return None

df['english_clean'] = df['english'].apply(better_clean)

# View results
df[['term', 'swedish', 'english_clean']].head()


In [None]:
df.tail()

In [None]:
df.head()

In [None]:
import pandas as pd
import numpy as np
import re

def clean_dataframe(df):
    # Reset index
    df = df.reset_index(drop=True)
    
    # Custom cleaning function
    def clean_text(text):
        if pd.isna(text):
            return text
        # Keep only content before first \n
        cleaned = str(text).split('\n')[0]
        # Remove word tags at start (pp, nn, ab, etc.)
        cleaned = re.sub(r'^\w+\s', '', cleaned)
        return cleaned.strip()
    
    # Apply cleaning
    df['english'] = df['english'].apply(clean_text)
    
    # Preserve good english_clean values
    df['english'] = np.where(
        df['english_clean'].notna() & (df['english_clean'] != df['english']),
        df['english_clean'],
        df['english']
    )
    df['english_clean'] = df['english']
    
    # Standardize
    df['term'] = df['term'].str.lower()
    df['swedish'] = df['swedish'].str.lower()
    
    return df.drop_duplicates(subset=['term'])

# Apply cleaning
df = clean_dataframe(df)

# Ensure full content displays
pd.set_option('display.max_colwidth', None)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.loc[10]

In [12]:
df.loc[13]

term                                  abborre
swedish                               abborre
english          abborr|pinnevery small perch
english_clean    abborr|pinnevery small perch
Name: 13, dtype: object

In [1]:
import xml.etree.ElementTree as ET
tree = ET.parse("en-sv.xml.gz")
for seg in tree.findall(".//seg"):
    print(seg.text)  # Extract Swedish/English text

ParseError: not well-formed (invalid token): line 1, column 0 (<string>)

In [2]:
import gzip
import xml.etree.ElementTree as ET

# Decompress the .xml.gz file
with gzip.open("en-sv.xml.gz", "rb") as gz_file:
    with open("en-sv.xml", "wb") as xml_file:
        xml_file.write(gz_file.read())

# Now parse the decompressed XML
tree = ET.parse("en-sv.xml")
root = tree.getroot()

In [None]:
from lxml import etree
import gzip

with gzip.open("en-sv.xml.gz", "rb") as f:
    tree = etree.parse(f)

In [None]:
df.info()