In [1]:
pip install pandas lxml


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import gzip
import pandas as pd
from lxml import etree

def process_tmx_chunked(file_path, chunk_size=10000):
    """Parse a gzip-compressed TMX file into an English/Swedish DataFrame.

    The file is streamed with ``etree.iterparse`` and every ``<tu>`` element
    is released as soon as it has been handled, so the XML tree never grows
    with the size of the input. A ``<tu>`` contributes one row only when it
    has both an ``xml:lang="en"`` and an ``xml:lang="sv"`` ``<tuv>`` child.

    Parameters
    ----------
    file_path : str or path-like
        Path of the ``.tmx.gz`` file; handed to ``gzip.open``.
    chunk_size : int, default 10000
        Number of rows buffered in a plain list before they are converted
        into an intermediate DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``english`` and ``swedish``, one row per matching
        translation unit. When no unit matches, an empty DataFrame with
        those two columns is returned instead of raising.

    Notes
    -----
    A ``<tuv>`` whose ``<seg>`` is missing or has no text yields ``''``.
    Prints the number of rows collected before returning.
    """
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
    columns = ['english', 'swedish']

    # Intermediate DataFrames, one per full buffer
    chunks = []

    # Row buffer for the chunk currently being filled
    current_chunk = []
    processed_count = 0

    # The context manager guarantees the gzip handle is closed even if
    # parsing fails part-way; iterparse is lazy, so the loop must stay
    # inside the ``with`` block.
    with gzip.open(file_path) as tmx_file:
        context = etree.iterparse(tmx_file, events=('end',), tag='tu')

        for _, elem in context:
            try:
                variants = {}
                for tuv in elem.findall('tuv'):
                    lang = tuv.get(xml_lang)
                    seg = tuv.find('seg')
                    # seg.text is None for an empty <seg/>; normalise it to
                    # '' so it matches the missing-<seg> case.
                    variants[lang] = (seg.text or '') if seg is not None else ''

                if 'en' in variants and 'sv' in variants:
                    current_chunk.append({
                        'english': variants['en'],
                        'swedish': variants['sv']
                    })
                    processed_count += 1

                    # When the buffer is full, convert it and start a new one
                    if len(current_chunk) >= chunk_size:
                        chunks.append(pd.DataFrame(current_chunk))
                        current_chunk = []
            finally:
                # Release the element and any already-processed siblings so
                # the partially built tree does not accumulate in memory.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]

    # Flush whatever is left in the last, partially filled buffer
    if current_chunk:
        chunks.append(pd.DataFrame(current_chunk))

    # pd.concat raises ValueError on an empty list, so handle the
    # "no matching translation units" case explicitly.
    if chunks:
        final_df = pd.concat(chunks, ignore_index=True)
    else:
        final_df = pd.DataFrame(columns=columns)

    print(f"Processed {processed_count} translation units")
    return final_df

# Load the English-Swedish corpus, then print a preview of its first rows.
df = process_tmx_chunked(file_path='en-sv.tmx.gz')
print(df.head(n=5))

Processed 43533711 translation units
                                             english  \
0               Previously on The Hot Zone: Anthrax.   
1  Director Mueller just assigned us a major case...   
2  Investigation''s  officially been dubbed Ameri...   
3  Whoever sent these  letters got their Anthrax ...   
4  We wouldn''t be here if we didn''t have eviden...   

                                             swedish  
0                              I tidigare avsnitt...  
1      Byråchef Mueller gav oss just ett stort fall.  
2            Utredningen har fått namnet Amerithrax.  
3  Brevskickaren fick sin mjältbrand från ett ame...  
4  Vi hade inte varit här om inte bevisen pekat p...  


In [7]:
# Print a summary of the frame: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43533711 entries, 0 to 43533710
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   english  object
 1   swedish  object
dtypes: object(2)
memory usage: 664.3+ MB


In [9]:
# Display the first five sentence pairs as a quick sanity check.
df.head()

Unnamed: 0,english,swedish
0,Previously on The Hot Zone: Anthrax.,I tidigare avsnitt...
1,Director Mueller just assigned us a major case...,Byråchef Mueller gav oss just ett stort fall.
2,Investigation''s officially been dubbed Ameri...,Utredningen har fått namnet Amerithrax.
3,Whoever sent these letters got their Anthrax ...,Brevskickaren fick sin mjältbrand från ett ame...
4,We wouldn''t be here if we didn''t have eviden...,Vi hade inte varit här om inte bevisen pekat p...


In [8]:
# Display the last five sentence pairs to confirm the file was read to the end.
df.tail()

Unnamed: 0,english,swedish
43533706,"You are already almost 15 minutes late. Oh, my...","-Gå nu, du är nästan en kvart sen."
43533707,By the powers vested in me by the state of Sou...,I kraft av mitt ämbete i staten South Carolina...
43533708,Who invited you? - I'm-- - Beat it.,Vem bjöd in dig?
43533709,Okay.,! Stick!
43533710,You may now kiss your bride.,Ni kan kyssa er brud.
