In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

Path = '/content/drive/MyDrive/MLT+Dataset/Dataset/'

In [3]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
from nltk.stem import PorterStemmer

In [None]:

# Load each English and German file
german_files = ["commoncrawl_de_en.txt", "europarl-v7_de_en.txt", "news-commentary-v9_de_en.txt"]
english_files = ["commoncrawl_en_de.txt", "europarl-v7_en_de.txt", "news-commentary-v9_en_de.txt"]

In [None]:

# Read English and German text files
with open(Path + 'commoncrawl_en_de.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open(Path + 'commoncrawl_de_en.txt', 'r', encoding='utf-8') as ger_file:
    german_sentences = ger_file.readlines()

# Strip any unnecessary whitespace (e.g., newline characters)
commoncrawl_english_sentences = [sentence.strip() for sentence in english_sentences]
commoncrawl_german_sentences = [sentence.strip() for sentence in german_sentences]

In [None]:
if len(commoncrawl_english_sentences) != len(commoncrawl_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [None]:
commoncrawl_en_de_df = pd.DataFrame({
    'English': commoncrawl_english_sentences,
    'German': commoncrawl_german_sentences
})

In [None]:
commoncrawl_en_de_df.shape

(2399123, 2)

In [None]:
# Read English and German text files
with open(Path + 'europarl-v7_en_de.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open(Path + 'europarl-v7_de_en.txt', 'r', encoding='utf-8') as ger_file:
    german_sentences = ger_file.readlines()

# Strip any unnecessary whitespace (e.g., newline characters)
europarl_english_sentences = [sentence.strip() for sentence in english_sentences]
europarl_german_sentences = [sentence.strip() for sentence in german_sentences]

In [None]:
if len(europarl_english_sentences) != len(europarl_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [None]:

europarl_en_de_df = pd.DataFrame({
    'English': europarl_english_sentences,
    'German': europarl_english_sentences
})

In [None]:
europarl_en_de_df.shape

(1920209, 2)

In [None]:
# commentary_english_sentences = [sentence.strip() for sentence in english_sentences if sentence.strip()]
# commentary_german_sentences = [sentence.strip() for sentence in german_sentences if sentence.strip()]

In [None]:
# Read English and German text files
# cleaning data for news comm in german
def is_special_characters(line):
    return re.match(r'^[^a-zA-Z0-9\s]+$', line) is not None

commentary_english_sentences = []
commentary_german_sentences = []

with open(Path + 'news-commentary-v9_en_de.txt', 'r', encoding='utf-8') as file:
  for line in file:
        cleaned_line = line.strip()

        if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
            # Replace multiple spaces with a single space
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line)
            commentary_english_sentences.append(cleaned_line)

with open(Path + 'news-commentary-v9_en_de.txt', 'r', encoding='utf-8') as file:
  for line in file:
        cleaned_line = line.strip()

        if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
            # Replace multiple spaces with a single space
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line)
            commentary_german_sentences.append(cleaned_line)

In [None]:
if len(commentary_english_sentences) != len(commentary_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [None]:
commentary_en_de_df = pd.DataFrame({
    'English': commentary_english_sentences,
    'German': commentary_german_sentences
})

In [None]:
commentary_en_de_df.shape

(201553, 2)

In [None]:
final_df = pd.concat([commoncrawl_en_de_df, europarl_en_de_df, commentary_en_de_df], axis=0, ignore_index=True)

In [None]:
final_df.shape

(4520885, 2)

In [None]:
# Cleaning

In [None]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and punctuation (except apostrophes in contractions)
    text = re.sub(r"[^a-zA-ZäöüßÄÖÜéèàùâêîôûç'\s]", '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning function to both columns
final_df['English'] = final_df['English'].apply(clean_text)
final_df['German'] = final_df['German'].apply(clean_text)

# Display cleaned DataFrame
print(final_df)

                                                   English  \
0        iron cement is a ready for use paste which is ...   
1        iron cement protects the ingot against the hot...   
2        a fire restant repair cement for fire places o...   
3                  construction and repair of highways and   
4             an announcement must be commercial character   
...                                                    ...   
4520880  their achievement remains one of the greatest ...   
4520881  at the same time zumas revolutionary generatio...   
4520882  in a region that reveres the elderly zumas att...   
4520883  three in ten south africans are younger than m...   
4520884  somehow zuma must find a way to honor his own ...   

                                                    German  
0        iron cement ist eine gebrauchsfertige paste di...  
1        nach der aushärtung schützt iron cement die ko...  
2        feuerfester reparaturkitt für feuerungsanlagen...  
3          

In [None]:
final_df.head()

Unnamed: 0,English,German
0,iron cement is a ready for use paste which is ...,iron cement ist eine gebrauchsfertige paste di...
1,iron cement protects the ingot against the hot...,nach der aushärtung schützt iron cement die ko...
2,a fire restant repair cement for fire places o...,feuerfester reparaturkitt für feuerungsanlagen...
3,construction and repair of highways and,der bau und die reparatur der autostraßen
4,an announcement must be commercial character,die mitteilungen sollen den geschäftlichen kom...


In [None]:
# Preprocess

In [None]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:

nltk.download('stopwords')

from nltk.corpus import stopwords

german_stop_words = stopwords.words('german')
english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
chunk_size = 500000  # Adjust this based on available memory
max_seq_length = 50  # Adjust based on data analysis
output_dir = '/content/data1117/preprocessed_chunks/'
os.makedirs(output_dir, exist_ok=True)

# Initialize tokenizers
stemmer = PorterStemmer()
english_tokenizer = Tokenizer()
german_tokenizer = Tokenizer()

# Split the DataFrame into chunks manually
num_chunks = len(final_df) // chunk_size + (1 if len(final_df) % chunk_size != 0 else 0)

# Step 1: Fit Tokenizers Across Chunks
for i in range(num_chunks):
    print("num_chunk----->", i)
    chunk = final_df.iloc[i * chunk_size:(i + 1) * chunk_size]
    chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
    chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for token in tokens if token not in german_stop_words])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [token for token in tokens if token not in english_stop_words])

    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['German'] = chunk['german_tokens'].apply(lambda tokens: ' '.join(tokens))
    chunk['English'] = chunk['english_tokens'].apply(lambda tokens: ' '.join(tokens))

    # Update tokenizers
    english_tokenizer.fit_on_texts(chunk['English'])
    german_tokenizer.fit_on_texts(chunk['German'])

    # Convert to sequences
    english_sequences = english_tokenizer.texts_to_sequences(chunk['English'])
    german_sequences = german_tokenizer.texts_to_sequences(chunk['German'])

    # Pad sequences
    english_padded = pad_sequences(english_sequences, maxlen=max_seq_length, padding='post')
    german_padded = pad_sequences(german_sequences, maxlen=max_seq_length, padding='post')

    # Save each chunk
    np.save(os.path.join(output_dir, f'english_chunk_{i}.npy'), english_padded)
    np.save(os.path.join(output_dir, f'german_chunk_{i}.npy'), german_padded)

print("Data processing in chunks completed. Tokenized sequences are saved to disk.")


num_chunk-----> 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

num_chunk-----> 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

Data processing in chunks completed. Tokenized sequences are saved to disk.


In [None]:
# Initialize lists to store all data
english_data = []
german_data = []
output_dir = '/content/data1117/preprocessed_chunks/'
# Iterate through all saved chunks
num_chunks = len([name for name in os.listdir(output_dir) if name.startswith('english_chunk_')])

for i in range(num_chunks):
    # Load the English and German chunks
    english_chunk = np.load(os.path.join(output_dir, f'english_chunk_{i}.npy'))
    german_chunk = np.load(os.path.join(output_dir, f'german_chunk_{i}.npy'))

    # Append to the list
    english_data.extend(english_chunk)
    german_data.extend(german_chunk)

# Convert lists to DataFrame
preprocessed_data = pd.DataFrame({
    'English': english_data,
    'German': german_data
})

print("Combined DataFrame created successfully.")
print(preprocessed_data.head())

Combined DataFrame created successfully.
                                             English  \
0  [1014, 7639, 1353, 3, 785, 2867, 18320, 22494,...   
1  [1014, 7639, 342, 13991, 872, 14392, 2611, 143...   
2  [1126, 27942, 1884, 7639, 1126, 40, 4250, 80, ...   
3  [810, 1884, 1898, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
4  [723, 189, 850, 416, 0, 0, 0, 0, 0, 0, 0, 0, 0...   

                                              German  
0  [5395, 49077, 70794, 7561, 109324, 17027, 1630...  
1  [163055, 4128, 5395, 49077, 109326, 11775, 109...  
2  [61269, 163056, 163057, 24008, 813, 70795, 321...  
3  [2008, 4129, 70796, 0, 0, 0, 0, 0, 0, 0, 0, 0,...  
4  [7877, 515, 5812, 4362, 1522, 1308, 0, 0, 0, 0...  


In [None]:
preprocessed_data.tail()

Unnamed: 0,English,German
4520880,"[206, 308, 6, 1614, 293, 581, 0, 0, 0, 0, 0, 0...","[51, 238, 436, 55, 2, 1, 2300, 384, 1176, 0, 0..."
4520881,"[9, 23183, 3490, 95, 149, 525, 10374, 272, 639...","[25, 1, 196, 70, 41833, 8285, 202, 221, 584, 2..."
4520882,"[91, 9076, 3174, 23183, 1602, 1158, 417, 14, 1...","[5, 126, 6, 31423, 1, 4575, 41833, 2795, 3, 19..."
4520883,"[242, 1061, 639, 1595, 3586, 104, 175, 81, 954...","[454, 1369, 1212, 2037, 16, 9775, 111, 180, 6,..."
4520884,"[4363, 23183, 14, 76, 52, 3019, 95, 352, 5533,...","[8557, 41833, 37, 334, 5, 119, 3, 12529, 197, ..."


In [None]:
preprocessed_data.to_parquet('/content/drive/MyDrive/capstone_preprocess.parquet')

In [4]:
df = pd.read_parquet('/content/drive/MyDrive/capstone_preprocess.parquet')


In [5]:
df.head()

Unnamed: 0,English,German
0,"[1014, 7639, 1353, 3, 785, 2867, 18320, 22494,...","[5395, 49077, 70794, 7561, 109324, 17027, 1630..."
1,"[1014, 7639, 342, 13991, 872, 14392, 2611, 143...","[163055, 4128, 5395, 49077, 109326, 11775, 109..."
2,"[1126, 27942, 1884, 7639, 1126, 40, 4250, 80, ...","[61269, 163056, 163057, 24008, 813, 70795, 321..."
3,"[810, 1884, 1898, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[2008, 4129, 70796, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,"[723, 189, 850, 416, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[7877, 515, 5812, 4362, 1522, 1308, 0, 0, 0, 0..."


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Extract English and German columns
english_sequences = df['English'].tolist()
german_sequences = df['German'].tolist()

# Determine the maximum sequence length (optional if sequences already have padding)
max_seq_len = max(
    max(len(seq) for seq in english_sequences),
    max(len(seq) for seq in german_sequences)
)

# Pad sequences
X_english = pad_sequences(english_sequences, maxlen=max_seq_len, padding='post')
X_german = pad_sequences(german_sequences, maxlen=max_seq_len, padding='post')

# Print shapes to confirm
print(f"Shape of English sequences: {X_english.shape}")
print(f"Shape of German sequences: {X_german.shape}")


Shape of English sequences: (4520885, 50)
Shape of German sequences: (4520885, 50)


In [7]:
# Calculate the vocabulary size (maximum token index + 1)
vocab_size_english = np.max(X_english) + 1
vocab_size_german = np.max(X_german) + 1

print(f"Vocabulary size (English): {vocab_size_english}")
print(f"Vocabulary size (German): {vocab_size_german}")


Vocabulary size (English): 677947
Vocabulary size (German): 1403218


In [8]:
from sklearn.model_selection import train_test_split

# Split the data (80% train, 20% test)
X_train_english, X_test_english, X_train_german, X_test_german = train_test_split(
    X_english, X_german, test_size=0.2, random_state=42
)

print(f"Training data shape: {X_train_english.shape}, {X_train_german.shape}")
print(f"Testing data shape: {X_test_english.shape}, {X_test_german.shape}")


Training data shape: (3616708, 50), (3616708, 50)
Testing data shape: (904177, 50), (904177, 50)


In [9]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, Dense, TimeDistributed

# Assuming X_train_english and X_train_german are already padded sequences

# Hyperparameters
embedding_dim = 128
vocab_size_english = len(np.unique(np.concatenate(X_train_english))) + 1  # Add 1 for padding token
vocab_size_german = len(np.unique(np.concatenate(X_train_german))) + 1  # Add 1 for padding token
max_seq_len = max([len(seq) for seq in X_train_english])  # Assuming max_seq_len is the same for both languages

# Define Encoder
encoder_input = Input(shape=(max_seq_len,))
encoder_embedding = Embedding(input_dim=vocab_size_english, output_dim=embedding_dim, input_length=max_seq_len)(encoder_input)
encoder_rnn = SimpleRNN(128, return_state=True)
encoder_output, encoder_state = encoder_rnn(encoder_embedding)

# Define Decoder
decoder_input = Input(shape=(max_seq_len,))
decoder_embedding = Embedding(input_dim=vocab_size_german, output_dim=embedding_dim, input_length=max_seq_len)(decoder_input)
decoder_rnn = SimpleRNN(128, return_sequences=True, return_state=False)
decoder_output = decoder_rnn(decoder_embedding, initial_state=encoder_state)

# Define the output layer with TimeDistributed(Dense)
decoder_dense = TimeDistributed(Dense(vocab_size_german, activation='softmax'))
decoder_final_output = decoder_dense(decoder_output)

# Build the model
model = Model(inputs=[encoder_input, decoder_input], outputs=decoder_final_output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()




In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Ensure padding for the English and German sequences
X_train_english = pad_sequences(X_train_english, maxlen=max_seq_len, padding='post')
X_train_german = pad_sequences(X_train_german, maxlen=max_seq_len, padding='post')

# German target sequences should be shifted by one token (for teacher forcing)
y_train_german = X_train_german[:, 1:]  # Remove the first token
# The line below adds padding to the target sequence to make its length same as the input
y_train_german = pad_sequences(y_train_german, maxlen=max_seq_len, padding='post')
y_train_german = np.expand_dims(y_train_german, -1)  # Add extra dimension for sparse categorical crossentropy

In [None]:
model.fit([X_train_english, X_train_german], y_train_german, batch_size=512, epochs=10)


In [1]:
import torch
import torch.nn as nn

# Define the Encoder
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers):
        super(Encoder, self).__init__()
        self.rnn = nn.RNN(input_dim, hid_dim, n_layers, batch_first=True)

    def forward(self, src):
        # src: [batch size, src len, input_dim]
        outputs, hidden = self.rnn(src)
        # outputs: [batch size, src len, hid dim]
        # hidden: [n layers, batch size, hid dim]
        return hidden


# Define the Decoder
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers):
        super(Decoder, self).__init__()
        self.rnn = nn.RNN(output_dim, hid_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)

    def forward(self, input, hidden):
        # input: [batch size, 1, output_dim]
        # hidden: [n layers, batch size, hid dim]
        output, hidden = self.rnn(input, hidden)
        # output: [batch size, 1, hid dim]
        prediction = self.fc_out(output.squeeze(1))
        # prediction: [batch size, output_dim]
        return prediction, hidden


# Define the Seq2Seq Model
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src: [batch size, src len, input_dim]
        # trg: [batch size, trg len, output_dim]
        trg_len = trg.size(1)
        batch_size = src.size(0)
        output_dim = trg.size(2)

        outputs = torch.zeros(batch_size, trg_len, output_dim).to(self.device)

        # Encode the source sequence
        hidden = self.encoder(src)

        # First input to the decoder is the first target token
        input = trg[:, 0, :].unsqueeze(1)

        for t in range(1, trg_len):
            # Pass the input through the decoder
            output, hidden = self.decoder(input, hidden)
            outputs[:, t, :] = output

            # Decide whether to use teacher forcing
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            input = trg[:, t, :].unsqueeze(1) if teacher_force else output.unsqueeze(1)

        return outputs


# Model Hyperparameters
INPUT_DIM = 50   # Number of features in the input sequence
OUTPUT_DIM = 50  # Number of features in the output sequence
HID_DIM = 128    # Hidden state dimension
N_LAYERS = 2     # Number of RNN layers

# Instantiate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder = Encoder(INPUT_DIM, HID_DIM, N_LAYERS)
decoder = Decoder(OUTPUT_DIM, HID_DIM, N_LAYERS)
model = Seq2Seq(encoder, decoder, device).to(device)

# Print Model Summary
print(model)


Seq2Seq(
  (encoder): Encoder(
    (rnn): RNN(50, 128, num_layers=2, batch_first=True)
  )
  (decoder): Decoder(
    (rnn): RNN(50, 128, num_layers=2, batch_first=True)
    (fc_out): Linear(in_features=128, out_features=50, bias=True)
  )
)


In [2]:
from sklearn.model_selection import train_test_split

# Assuming `data` is your entire dataset as a NumPy array or pandas DataFrame
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")


NameError: name 'data' is not defined

In [3]:
from torch.utils.data import Dataset, DataLoader
import torch

class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data  # `data` is a list or array of (src, trg) pairs

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, trg = self.data[idx]
        return torch.tensor(src, dtype=torch.float32), torch.tensor(trg, dtype=torch.float32)


In [4]:
BATCH_SIZE = 64

# Create dataset instances
train_dataset = TranslationDataset(train_data)
val_dataset = TranslationDataset(val_data)
test_dataset = TranslationDataset(test_data)

# Create DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


NameError: name 'train_data' is not defined

In [5]:
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0

    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()

        output = model(src, trg)  # Forward pass
        output_dim = output.shape[-1]
        trg = trg[:, 1:, :]  # Shift target sequence to match output
        output = output[:, :-1, :].contiguous().view(-1, output_dim)
        trg = trg.contiguous().view(-1, output_dim)

        loss = criterion(output, trg)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update parameters

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader)


def evaluate_model(model, val_loader, criterion, device):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in val_loader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0.0)  # No teacher forcing
            output_dim = output.shape[-1]
            trg = trg[:, 1:, :]
            output = output[:, :-1, :].contiguous().view(-1, output_dim)
            trg = trg.contiguous().view(-1, output_dim)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(val_loader)


In [6]:
# Hyperparameters
LEARNING_RATE = 1e-3
EPOCHS = 10

# Optimizer and Loss Function
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()  # Use CrossEntropyLoss for classification tasks

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Training Loop
for epoch in range(EPOCHS):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate_model(model, val_loader, criterion, device)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")


NameError: name 'train_loader' is not defined

In [7]:
def test_model(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0

    with torch.no_grad():
        for src, trg in test_loader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0.0)
            output_dim = output.shape[-1]
            trg = trg[:, 1:, :]
            output = output[:, :-1, :].contiguous().view(-1, output_dim)
            trg = trg.contiguous().view(-1, output_dim)

            loss = criterion(output, trg)
            test_loss += loss.item()

    return test_loss / len(test_loader)

# Run the test
test_loss = test_model(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")


NameError: name 'test_loader' is not defined