In [1]:
pip install pandas lxml


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import gzip
import pandas as pd
from lxml import etree

def process_tmx_chunked(file_path, chunk_size=10000):
    # Create an empty list to store DataFrames
    chunks = []
    
    # Initialize variables for chunk processing
    current_chunk = []
    processed_count = 0
    
    # Use iterparse for memory-efficient parsing
    context = etree.iterparse(
        gzip.open(file_path), 
        events=('end',), 
        tag='tu'
    )
    
    for event, elem in context:
        try:
            variants = {}
            for tuv in elem.findall('tuv'):
                lang = tuv.get('{http://www.w3.org/XML/1998/namespace}lang')
                seg = tuv.find('seg')
                variants[lang] = seg.text if seg is not None else ''
            
            if 'en' in variants and 'sv' in variants:
                current_chunk.append({
                    'english': variants['en'],
                    'swedish': variants['sv']
                })
                processed_count += 1
                
                # When chunk is full, convert to DataFrame and store
                if len(current_chunk) >= chunk_size:
                    chunks.append(pd.DataFrame(current_chunk))
                    current_chunk = []
        finally:
            # Clear memory after processing each element
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
    
    # Add any remaining items in the last chunk
    if current_chunk:
        chunks.append(pd.DataFrame(current_chunk))
    
    # Combine all chunks into a single DataFrame
    final_df = pd.concat(chunks, ignore_index=True)
    
    print(f"Processed {processed_count} translation units")
    return final_df

# Usage:
df = process_tmx_chunked('en-sv.tmx.gz')
print(df.head())  # Now this will work

Processed 43533711 translation units
                                             english  \
0               Previously on The Hot Zone: Anthrax.   
1  Director Mueller just assigned us a major case...   
2  Investigation''s  officially been dubbed Ameri...   
3  Whoever sent these  letters got their Anthrax ...   
4  We wouldn''t be here if we didn''t have eviden...   

                                             swedish  
0                              I tidigare avsnitt...  
1      Byråchef Mueller gav oss just ett stort fall.  
2            Utredningen har fått namnet Amerithrax.  
3  Brevskickaren fick sin mjältbrand från ett ame...  
4  Vi hade inte varit här om inte bevisen pekat p...  


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43533711 entries, 0 to 43533710
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   english  object
 1   swedish  object
dtypes: object(2)
memory usage: 664.3+ MB


In [9]:
df.head()

Unnamed: 0,english,swedish
0,Previously on The Hot Zone: Anthrax.,I tidigare avsnitt...
1,Director Mueller just assigned us a major case...,Byråchef Mueller gav oss just ett stort fall.
2,Investigation''s officially been dubbed Ameri...,Utredningen har fått namnet Amerithrax.
3,Whoever sent these letters got their Anthrax ...,Brevskickaren fick sin mjältbrand från ett ame...
4,We wouldn''t be here if we didn''t have eviden...,Vi hade inte varit här om inte bevisen pekat p...


In [2]:
df.tail()

NameError: name 'df' is not defined

In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [13]:
pip install keras-nlp

Defaulting to user installation because normal site-packages is not writeable
Collecting keras-nlp
  Downloading keras_nlp-0.20.0-py3-none-any.whl (2.1 kB)
Collecting keras-hub==0.20.0
  Downloading keras_hub-0.20.0-py3-none-any.whl (792 kB)
     -------------------------------------- 792.1/792.1 kB 2.6 MB/s eta 0:00:00
Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
     ---------------------------------------- 68.0/68.0 kB ? eta 0:00:00
Installing collected packages: kagglehub, keras-hub, keras-nlp
Successfully installed kagglehub-0.3.12 keras-hub-0.20.0 keras-nlp-0.20.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

# 1. Clean the data first
def clean_data(df):
    # Remove rows with None/NaN in either column
    df = df.dropna(subset=['swedish', 'english'])
    
    # Convert to string and strip whitespace
    df['swedish'] = df['swedish'].astype(str).str.strip()
    df['english'] = df['english'].astype(str).str.strip()
    
    # Remove empty strings
    df = df[(df['swedish'] != '') & (df['english'] != '')]
    
    return df

# Clean your DataFrame
df_clean = clean_data(df)

# 2. Modified Transformer Translator with better error handling
class RobustTranslator(keras.Model):
    def __init__(self, max_vocab_size=20000, max_length=50, embed_dim=256):
        super().__init__()
        
        # Swedish encoder with output mode='int'
        self.sv_encoder = keras.layers.TextVectorization(
            max_tokens=max_vocab_size,
            output_sequence_length=max_length,
            output_mode='int'
        )
        
        # English decoder with output mode='int'
        self.en_decoder = keras.layers.TextVectorization(
            max_tokens=max_vocab_size,
            output_sequence_length=max_length,
            output_mode='int'
        )
        
        # Embedding layers
        self.sv_embedding = keras.layers.Embedding(max_vocab_size, embed_dim)
        self.en_embedding = keras.layers.Embedding(max_vocab_size, embed_dim)
        
        # Encoder (BiLSTM)
        self.encoder = keras.layers.Bidirectional(
            keras.layers.LSTM(embed_dim, return_sequences=True))
        
        # Attention layer
        self.attention = keras.layers.Attention()
        
        # Decoder (LSTM)
        self.decoder = keras.layers.LSTM(embed_dim, return_sequences=True)
        
        # Output layer
        self.output_layer = keras.layers.Dense(max_vocab_size, activation='softmax')
    
    def call(self, inputs):
        sv_text, en_text = inputs
        
        # Encode Swedish
        sv_tokens = self.sv_encoder(sv_text)
        sv_emb = self.sv_embedding(sv_tokens)
        encoded = self.encoder(sv_emb)
        
        # Decode English
        en_tokens = self.en_decoder(en_text)
        en_emb = self.en_embedding(en_tokens)
        
        # Attention
        context = self.attention([en_emb, encoded])
        
        # Decode
        decoded = self.decoder(context)
        return self.output_layer(decoded)

# 3. Prepare data
train_df, val_df = train_test_split(df_clean, test_size=0.2)

# Initialize model
translator = RobustTranslator()

# Adapt the vectorizers - now using .tolist() to ensure proper conversion
translator.sv_encoder.adapt(train_df['swedish'].tolist())
translator.en_decoder.adapt(train_df['english'].tolist())

# 4. Data preparation with proper padding
def prepare_data(sv_text, en_text):
    # Convert to lists to avoid Tensor conversion issues
    sv_text = sv_text.tolist()
    en_text = en_text.tolist()
    
    # Get sequences
    sv_seq = translator.sv_encoder(sv_text)
    en_seq = translator.en_decoder(en_text)
    
    # Teacher forcing setup
    return (sv_seq, en_seq[:, :-1]), en_seq[:, 1:]

# Create datasets
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_df['swedish'], train_df['english'])
).batch(32).map(prepare_data)

val_ds = tf.data.Dataset.from_tensor_slices(
    (val_df['swedish'], val_df['english'])
).batch(32).map(prepare_data)

# 5. Compile and train
translator.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = translator.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5
)

NameError: name 'df' is not defined

In [14]:
import keras_nlp
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

class TransformerTranslator(keras.Model):
    def __init__(self, max_vocab_size=20000, max_length=50, embed_dim=256):
        super().__init__()
        
        # Swedish encoder
        self.sv_encoder = keras.layers.TextVectorization(
            max_tokens=max_vocab_size, output_sequence_length=max_length)
        
        # English decoder
        self.en_decoder = keras.layers.TextVectorization(
            max_tokens=max_vocab_size, output_sequence_length=max_length)
        
        # Embedding layers
        self.sv_embedding = keras.layers.Embedding(max_vocab_size, embed_dim)
        self.en_embedding = keras.layers.Embedding(max_vocab_size, embed_dim)
        
        # Transformer
        self.transformer = keras_nlp.layers.TransformerEncoder(
            num_heads=8,
            intermediate_dim=512,
            dropout=0.1
        )
        
        # Output layer
        self.output_layer = keras.layers.Dense(max_vocab_size, activation='softmax')
    
    def call(self, inputs):
        sv_text, en_text = inputs
        
        # Encode Swedish
        sv_tokens = self.sv_encoder(sv_text)
        sv_emb = self.sv_embedding(sv_tokens)
        encoded = self.transformer(sv_emb)
        
        # Decode English
        en_tokens = self.en_decoder(en_text)
        en_emb = self.en_embedding(en_tokens)
        
        # Simple cross-attention (for demo purposes)
        attention = tf.matmul(en_emb, encoded, transpose_b=True)
        attention = tf.nn.softmax(attention)
        context = tf.matmul(attention, encoded)
        
        combined = tf.concat([en_emb, context], axis=-1)
        return self.output_layer(combined)

# Usage
train_df, val_df = train_test_split(df, test_size=0.2)
translator = TransformerTranslator()

# Adapt the text vectorizers
translator.sv_encoder.adapt(train_df['swedish'])
translator.en_decoder.adapt(train_df['english'])

# Compile and train
translator.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Prepare data - need to shift decoder inputs for teacher forcing
def prepare_data(sv_text, en_text):
    # Convert texts to sequences
    sv_seq = translator.sv_encoder(sv_text)
    en_seq = translator.en_decoder(en_text)
    
    # For teacher forcing: decoder_inputs is en_seq[:-1], targets are en_seq[1:]
    return (sv_seq, en_seq[:, :-1]), en_seq[:, 1:]

# Create tf.data.Dataset
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_df['swedish'], train_df['english'])
).batch(32).map(prepare_data)

val_ds = tf.data.Dataset.from_tensor_slices(
    (val_df['swedish'], val_df['english'])
).batch(32).map(prepare_data)

# Train
translator.fit(train_ds, validation_data=val_ds, epochs=5)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type NoneType).