In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
Path = '/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/data/'

In [4]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
from nltk.stem import PorterStemmer

**Step 1: Import and merge all the three datasets**

In [5]:

# Load each English and German file
german_files = ["commoncrawl_de_en.txt", "europarl-v7_de_en.txt", "news-commentary-v9_de_en.txt"]
english_files = ["commoncrawl_en_de.txt", "europarl-v7_en_de.txt", "news-commentary-v9_en_de.txt"]

In [6]:

# Read English and German text files
with open(Path + 'commoncrawl_en_de.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open(Path + 'commoncrawl_de_en.txt', 'r', encoding='utf-8') as ger_file:
    german_sentences = ger_file.readlines()

# Strip any unnecessary whitespace (e.g., newline characters)
commoncrawl_english_sentences = [sentence.strip() for sentence in english_sentences]
commoncrawl_german_sentences = [sentence.strip() for sentence in german_sentences]

In [7]:
if len(commoncrawl_english_sentences) != len(commoncrawl_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [8]:
commoncrawl_en_de_df = pd.DataFrame({
    'English': commoncrawl_english_sentences,
    'German': commoncrawl_german_sentences
})

In [9]:
commoncrawl_en_de_df.shape

(2399123, 2)

In [10]:
# Read English and German text files
with open(Path + 'europarl-v7_en_de.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open(Path + 'europarl-v7_de_en.txt', 'r', encoding='utf-8') as ger_file:
    german_sentences = ger_file.readlines()

# Strip any unnecessary whitespace (e.g., newline characters)
europarl_english_sentences = [sentence.strip() for sentence in english_sentences]
europarl_german_sentences = [sentence.strip() for sentence in german_sentences]

In [11]:
if len(europarl_english_sentences) != len(europarl_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [12]:
europarl_en_de_df = pd.DataFrame({
    'English': europarl_english_sentences,
    'German': europarl_english_sentences
})

In [13]:
europarl_en_de_df.shape

(1920209, 2)

In [14]:
# Read English and German text files
# cleaning data for news comm in german
def is_special_characters(line):
    return re.match(r'^[^a-zA-Z0-9\s]+$', line) is not None

commentary_english_sentences = []
commentary_german_sentences = []

with open(Path + 'news-commentary-v9_en_de.txt', 'r', encoding='utf-8') as file:
  for line in file:
        cleaned_line = line.strip()

        if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
            # Replace multiple spaces with a single space
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line)
            commentary_english_sentences.append(cleaned_line)

with open(Path + 'news-commentary-v9_en_de.txt', 'r', encoding='utf-8') as file:
  for line in file:
        cleaned_line = line.strip()

        if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
            # Replace multiple spaces with a single space
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line)
            commentary_german_sentences.append(cleaned_line)

In [15]:
if len(commentary_english_sentences) != len(commentary_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [16]:
commentary_en_de_df = pd.DataFrame({
    'English': commentary_english_sentences,
    'German': commentary_german_sentences
})

In [17]:
commentary_en_de_df.shape

(201553, 2)

In [18]:
final_df = pd.concat([commoncrawl_en_de_df, europarl_en_de_df, commentary_en_de_df], axis=0, ignore_index=True)

In [19]:
final_df.shape

(4520885, 2)

**Step 2: Data cleansing**

In [20]:
train_df = final_df.sample(frac=0.006, random_state=12)
hold_out_df = final_df.sample(frac=0.01, random_state=22)

In [21]:
final_df.shape

(4520885, 2)

In [22]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and punctuation (except apostrophes in contractions)
    text = re.sub(r"[^a-zA-ZäöüßÄÖÜéèàùâêîôûç'\s]", '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning function to both columns
train_df['English'] = train_df['English'].apply(clean_text)
train_df['German'] = train_df['German'].apply(clean_text)

hold_out_df['English'] = hold_out_df['English'].apply(clean_text)
hold_out_df['German'] = hold_out_df['German'].apply(clean_text)

# Display cleaned DataFrame
print(train_df)

                                                   English  \
2067265           the counterpart appeared two years later   
152051             public parking is available metres away   
1684613  we guarantee the return of amount paid for the...   
2757674  that question was put to the citizens of switz...   
3663922  we welcome that because we the united states n...   
...                                                    ...   
63578    the staff was very nice and everything is goo ...   
2478570  however if we arrive in paris ten minutes late...   
46256    there's much to explore in this area and the m...   
3853028  we need to deal with increased globalisation a...   
2860401  mr president let me start by acknowledging the...   

                                                    German  
2067265         das gegenstück erscheint zwei jahre später  
152051   öffentliche parkmöglichkeiten befinden sich in...  
1684613  wir garantieren die zurückzahlung der für die ...  
2757674  th

**Step 3: NLP pre processing - Dataset suitable to be used for AIML model learning**

In [23]:
train_df = train_df.reset_index()
hold_out_df = hold_out_df.reset_index()

In [24]:
train_df.head()

Unnamed: 0,index,English,German
0,2067265,the counterpart appeared two years later,das gegenstück erscheint zwei jahre später
1,152051,public parking is available metres away,öffentliche parkmöglichkeiten befinden sich in...
2,1684613,we guarantee the return of amount paid for the...,wir garantieren die zurückzahlung der für die ...
3,2757674,that question was put to the citizens of switz...,that question was put to the citizens of switz...
4,3663922,we welcome that because we the united states n...,we welcome that because we the united states n...


In [25]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [26]:
nltk.download('stopwords')

from nltk.corpus import stopwords

german_stop_words = stopwords.words('german')
english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [27]:
chunk_size = 500000  # Adjust this based on available memory
max_seq_length = 50  # Adjust based on data analysis
output_dir = '/content/data1117/preprocessed_chunks_training/'
os.makedirs(output_dir, exist_ok=True)

# Initialize tokenizers
stemmer = PorterStemmer()
english_tokenizer = Tokenizer()
german_tokenizer = Tokenizer()

# Split the DataFrame into chunks manually
num_chunks = len(train_df) // chunk_size + (1 if len(train_df) % chunk_size != 0 else 0)

# Step 1: Fit Tokenizers Across Chunks
for i in range(num_chunks):
    print("num_chunk----->", i)
    chunk = train_df.iloc[i * chunk_size:(i + 1) * chunk_size]
    chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
    chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for token in tokens if token not in german_stop_words])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [token for token in tokens if token not in english_stop_words])

    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['German'] = chunk['german_tokens'].apply(lambda tokens: ' '.join(tokens))
    chunk['English'] = chunk['english_tokens'].apply(lambda tokens: ' '.join(tokens))

    # Update tokenizers
    english_tokenizer.fit_on_texts(chunk['English'])
    german_tokenizer.fit_on_texts(chunk['German'])

    # Convert to sequences
    english_sequences = english_tokenizer.texts_to_sequences(chunk['English'])
    german_sequences = german_tokenizer.texts_to_sequences(chunk['German'])

    # Pad sequences
    english_padded = pad_sequences(english_sequences, maxlen=max_seq_length, padding='post')
    german_padded = pad_sequences(german_sequences, maxlen=max_seq_length, padding='post')

    # Save each chunk
    np.save(os.path.join(output_dir, f'english_chunk_{i}.npy'), english_padded)
    np.save(os.path.join(output_dir, f'german_chunk_{i}.npy'), german_padded)

print("Data processing in chunks completed. Tokenized sequences are saved to disk.")


num_chunk-----> 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

Data processing in chunks completed. Tokenized sequences are saved to disk.


In [28]:
chunk_size = 500000  # Adjust this based on available memory
max_seq_length = 50  # Adjust based on data analysis
output_dir = '/content/data1117/preprocessed_chunks_holdout/'
os.makedirs(output_dir, exist_ok=True)

# Initialize tokenizers
stemmer = PorterStemmer()
english_tokenizer = Tokenizer()
german_tokenizer = Tokenizer()

# Split the DataFrame into chunks manually
num_chunks = len(hold_out_df) // chunk_size + (1 if len(hold_out_df) % chunk_size != 0 else 0)

# Step 1: Fit Tokenizers Across Chunks
for i in range(num_chunks):
    print("num_chunk----->", i)
    chunk = hold_out_df.iloc[i * chunk_size:(i + 1) * chunk_size]
    chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
    chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for token in tokens if token not in german_stop_words])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [token for token in tokens if token not in english_stop_words])

    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['German'] = chunk['german_tokens'].apply(lambda tokens: ' '.join(tokens))
    chunk['English'] = chunk['english_tokens'].apply(lambda tokens: ' '.join(tokens))

    # Update tokenizers
    english_tokenizer.fit_on_texts(chunk['English'])
    german_tokenizer.fit_on_texts(chunk['German'])

    # Convert to sequences
    english_sequences = english_tokenizer.texts_to_sequences(chunk['English'])
    german_sequences = german_tokenizer.texts_to_sequences(chunk['German'])

    # Pad sequences
    english_padded = pad_sequences(english_sequences, maxlen=max_seq_length, padding='post')
    german_padded = pad_sequences(german_sequences, maxlen=max_seq_length, padding='post')

    # Save each chunk
    np.save(os.path.join(output_dir, f'english_chunk_{i}.npy'), english_padded)
    np.save(os.path.join(output_dir, f'german_chunk_{i}.npy'), german_padded)

print("Data processing in chunks completed. Tokenized sequences are saved to disk.")

num_chunk-----> 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

Data processing in chunks completed. Tokenized sequences are saved to disk.


In [29]:
# Initialize lists to store all data
english_data = []
german_data = []
output_dir = '/content/data1117/preprocessed_chunks_training/'
# Iterate through all saved chunks
num_chunks = len([name for name in os.listdir(output_dir) if name.startswith('english_chunk_')])

for i in range(num_chunks):
    # Load the English and German chunks
    english_chunk = np.load(os.path.join(output_dir, f'english_chunk_{i}.npy'))
    german_chunk = np.load(os.path.join(output_dir, f'german_chunk_{i}.npy'))

    # Append to the list
    english_data.extend(english_chunk)
    german_data.extend(german_chunk)

# Convert lists to DataFrame
preprocessed_training_data = pd.DataFrame({
    'English': english_data,
    'German': german_data
})

print("Combined DataFrame created successfully.")
print(preprocessed_training_data.head())

Combined DataFrame created successfully.
                                             English  \
0  [3371, 556, 58, 17, 706, 0, 0, 0, 0, 0, 0, 0, ...   
1  [86, 378, 120, 1378, 442, 0, 0, 0, 0, 0, 0, 0,...   
2  [449, 543, 492, 1293, 31, 0, 0, 0, 0, 0, 0, 0,...   
3  [87, 178, 210, 1663, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
4  [264, 200, 8, 18, 820, 2483, 1394, 107, 21, 14...   

                                              German  
0  [12720, 2338, 277, 131, 1308, 0, 0, 0, 0, 0, 0...  
1  [1501, 9944, 954, 305, 2263, 0, 0, 0, 0, 0, 0,...  
2  [3507, 12721, 226, 19194, 7197, 0, 0, 0, 0, 0,...  
3  [6, 132, 199, 3, 1, 182, 2, 6381, 0, 0, 0, 0, ...  
4  [10, 416, 6, 88, 10, 1, 290, 33, 53, 1086, 299...  


In [30]:
preprocessed_training_data.to_parquet('/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/data/capstone_preprocess_training.parquet')

In [31]:
# Initialize lists to store all data
english_data = []
german_data = []
output_dir = '/content/data1117/preprocessed_chunks_holdout/'
# Iterate through all saved chunks
num_chunks = len([name for name in os.listdir(output_dir) if name.startswith('english_chunk_')])

for i in range(num_chunks):
    # Load the English and German chunks
    english_chunk = np.load(os.path.join(output_dir, f'english_chunk_{i}.npy'))
    german_chunk = np.load(os.path.join(output_dir, f'german_chunk_{i}.npy'))

    # Append to the list
    english_data.extend(english_chunk)
    german_data.extend(german_chunk)

# Convert lists to DataFrame
preprocessed_holdout_data = pd.DataFrame({
    'English': english_data,
    'German': german_data
})

print("Combined DataFrame created successfully.")
print(preprocessed_holdout_data.head())

Combined DataFrame created successfully.
                                             English  \
0  [10, 40, 199, 415, 42, 508, 0, 0, 0, 0, 0, 0, ...   
1  [111, 2, 2432, 2111, 2557, 798, 1051, 1512, 20...   
2  [3630, 11267, 1035, 16200, 3398, 839, 3631, 16...   
3  [547, 56, 57, 22, 1302, 1292, 9013, 2745, 1473...   
4  [347, 738, 766, 2279, 635, 428, 311, 1941, 140...   

                                              German  
0  [1, 30, 23, 563, 3, 191, 1, 605, 2, 12, 88, 68...  
1  [5053, 7483, 6936, 9035, 10181, 1278, 0, 0, 0,...  
2  [664, 5054, 10182, 26879, 17785, 5659, 3905, 1...  
3  [13, 511, 6, 190, 139, 72, 1516, 48, 1615, 8, ...  
4  [1, 1429, 1, 2377, 4, 5660, 2, 1, 1343, 1, 422...  


In [32]:
preprocessed_holdout_data.to_parquet('/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/data/capstone_preprocess_holdout.parquet')

In [33]:
train_df = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/data/capstone_preprocess_training.parquet')
hold_out_df = pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/data/capstone_preprocess_holdout.parquet')

In [None]:
# Saving the LSTM model
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/simple_lstm_model.pth")

In [None]:
loaded_model = SimpleLSTM(input_dim, output_dim, hidden_dim)
loaded_model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/simple_lstm_model.pth"))
loaded_model.to(device)

**Step 4: Design, train and test Bidirectional RNN**

In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Step 1: Define the Dataset Class
class SequenceDataset(Dataset):
    def __init__(self, df, input_col, target_col, input_max_len=None, target_max_len=None):
        self.input_sequences = self.pad_sequences(df[input_col].tolist(), max_length=input_max_len)
        self.target_sequences = self.pad_sequences(df[target_col].tolist(), max_length=target_max_len)
    def pad_sequences(self, sequences, max_length=None):
        valid_sequences = [seq for seq in sequences if len(seq) > 0]
        if max_length is None:
            max_length = max(len(seq) for seq in valid_sequences)
        return np.array(
            valid_sequences + [[0] * max_length for _ in range(len(sequences) - len(valid_sequences))],
            dtype=np.int32
        )

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.input_sequences[idx], dtype=torch.long),
            torch.tensor(self.target_sequences[idx], dtype=torch.long),
        )

# Load and preprocess data
# Assume df is a Pandas DataFrame with "English" and "German" columns
input_max_len = 50  # Set maximum sequence lengths
target_max_len = 50
train_dataset = SequenceDataset(train_df, input_col="English", target_col="German", input_max_len=input_max_len, target_max_len=target_max_len)
hold_out_dataset = SequenceDataset(hold_out_df, input_col="English", target_col="German", input_max_len=input_max_len, target_max_len=target_max_len)

# DataLoader
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
holdout_dataloader = DataLoader(hold_out_dataset, batch_size=batch_size, shuffle=True)

# Step 2: Define the Bidirectional RNN Model
class BidirectionalRNN(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
        super(BidirectionalRNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

    def forward(self, x):
        embedded = self.embedding(x)  # Convert token indices to embeddings
        output, _ = self.rnn(embedded)  #
        output = self.fc(output)  # Fully connected layer
        return output



# Model parameters
input_dim = train_df["English"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of English
output_dim = train_df["German"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of German
embed_dim = 128  # Embedding dimension
hidden_dim = 256  # Hidden state dimension
batch_size = 32
num_epochs = 20
learning_rate = 0.001

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BidirectionalRNN(input_dim, embed_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)

        # Reshape outputs and targets for loss computation
        outputs = outputs.view(-1, output_dim)
        targets = targets.view(-1)

        # Compute loss
        loss = criterion(outputs, targets)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")

# Evaluation function
# def evaluate_model(model, dataloader):
#     model.eval()
#     total_loss = 0
#     with torch.no_grad():
#         for inputs, targets in dataloader:
#             inputs, targets = inputs.to(device), targets.to(device)

#             outputs = model(inputs)
#             outputs = outputs.view(-1, output_dim)
#             targets = targets.view(-1)

#             loss = criterion(outputs, targets)
#             total_loss += loss.item()
#     return total_loss / len(dataloader)

# # Test the model (using train data as test data for simplicity)
# test_loss = evaluate_model(model, train_dataloader)
# print(f"Test Loss: {test_loss}")

Epoch 1/20, Loss: 3.101881661715935
Epoch 2/20, Loss: 2.6308111647530548
Epoch 3/20, Loss: 2.2397344134185673
Epoch 4/20, Loss: 1.942171069494677
Epoch 5/20, Loss: 1.7885547555491048
Epoch 6/20, Loss: 1.6670257261128358
Epoch 7/20, Loss: 1.58733650904922
Epoch 8/20, Loss: 1.5139870382845402
Epoch 9/20, Loss: 1.4414301098869093
Epoch 10/20, Loss: 1.4028230209676724
Epoch 11/20, Loss: 1.3475716442642909
Epoch 12/20, Loss: 1.3192388594923716
Epoch 13/20, Loss: 1.2920569619339592
Epoch 14/20, Loss: 1.2769018903831548
Epoch 15/20, Loss: 1.2354355149392813
Epoch 16/20, Loss: 1.2193441656154562
Epoch 17/20, Loss: 1.1917094620612432
Epoch 18/20, Loss: 1.2015229151672069
Epoch 19/20, Loss: 1.1768935860644252
Epoch 20/20, Loss: 1.1766289272609185


In [41]:
input_dim = train_df["English"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of English
model.eval()
with torch.no_grad():
    for inputs, targets in train_dataloader:
        inputs = inputs.to(device) # Move inputs to the correct device
        # Removed one-hot encoding - the embedding layer expects integer indices
        outputs = model(inputs)
        print("Sample prediction:", torch.argmax(outputs, dim=2)[0].cpu().numpy())
        break

Sample prediction: [  257 49608  7335  1673  3966 49609  1886   230     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [42]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Define a function to compute BLEU score
def compute_bleu_score(reference, hypothesis):
    """
    Compute BLEU score for a single pair of reference and hypothesis sequences.
    :param reference: List of integers (ground truth sequence)
    :param hypothesis: List of integers (predicted sequence)
    :return: BLEU score
    """
    smoothing = SmoothingFunction().method1
    return sentence_bleu([reference], hypothesis, smoothing_function=smoothing)

# Evaluate BLEU score on a batch
model.eval()
total_bleu = 0
count = 0

with torch.no_grad():
    for inputs, targets in train_dataloader:
        # inputs_one_hot = torch.nn.functional.one_hot(inputs, num_classes=input_dim).float().to(device)
        inputs = inputs.to(device) # Move the inputs to the appropriate device
        targets = targets.to(device)

        # Forward pass
        outputs = model(inputs) # Pass the original inputs (integer indices)

        # Get the predicted sequences
        predictions = torch.argmax(outputs, dim=2)  # Shape: [batch_size, seq_length]

        # Compute BLEU for each sequence in the batch
        for i in range(predictions.size(0)):
            reference = targets[i].cpu().tolist()
            hypothesis = predictions[i].cpu().tolist()

            # Remove padding tokens (0s)
            reference = [token for token in reference if token != 0]
            hypothesis = [token for token in hypothesis if token != 0]

            bleu_score = compute_bleu_score(reference, hypothesis)
            total_bleu += bleu_score
            count += 1

# Compute the average BLEU score for the dataset
average_bleu = total_bleu / count
print(f"Average BLEU Score: {average_bleu:.4f}")

Average BLEU Score: 0.4147


In [43]:
# Saving the Bidirectional RNN model
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/Bidirectional_RNN_model.pth")

In [44]:
loaded_model = BidirectionalRNN(input_dim, embed_dim, hidden_dim, output_dim)
loaded_model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/Bidirectional_RNN_model.pth"))
loaded_model.to(device)

  loaded_model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/Bidirectional_RNN_model.pth"))


BidirectionalRNN(
  (embedding): Embedding(30507, 128)
  (rnn): RNN(128, 256, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=53853, bias=True)
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        energy = torch.tanh(self.attn(encoder_outputs))  # (batch_size, seq_len, hidden_size)
        energy = energy.matmul(self.v)  # (batch_size, seq_len)
        attention_weights = torch.softmax(energy, dim=1)  # (batch_size, seq_len)
        return attention_weights

class BiRNNWithAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super(BiRNNWithAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True, bidirectional=True, num_layers=2, dropout=0.5)
        self.attention = Attention(hidden_size)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        x = self.embedding(x)
        rnn_out, _ = self.rnn(x)
        attention_weights = self.attention(rnn_out, rnn_out)
        context_vector = attention_weights.unsqueeze(1).bmm(rnn_out)  # weighted sum
        out = self.fc(context_vector.squeeze(1))  # Final prediction
        return out

# Hyperparameters
vocab_size = 1000
embedding_dim = 50
hidden_size = 256
output_size = 1
batch_size = 32
sequence_length = 10

# Model and optimizer
model = BiRNNWithAttention(vocab_size, embedding_dim, hidden_size, output_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop (simplified)
for epoch in range(100):
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# Step 1: Define the Dataset Class
class SequenceDataset(Dataset):
    def __init__(self, df, input_col, target_col, input_max_len=None, target_max_len=None):
        self.input_sequences = self.pad_sequences(df[input_col].tolist(), max_length=input_max_len)
        self.target_sequences = self.pad_sequences(df[target_col].tolist(), max_length=target_max_len)
    def pad_sequences(self, sequences, max_length=None):
        valid_sequences = [seq for seq in sequences if len(seq) > 0]
        if max_length is None:
            max_length = max(len(seq) for seq in valid_sequences)
        return np.array(
            valid_sequences + [[0] * max_length for _ in range(len(sequences) - len(valid_sequences))],
            dtype=np.int32
        )

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.input_sequences[idx], dtype=torch.long),
            torch.tensor(self.target_sequences[idx], dtype=torch.long),
        )

# Load and preprocess data
# Assume df is a Pandas DataFrame with "English" and "German" columns
input_max_len = 50  # Set maximum sequence lengths
target_max_len = 50
train_dataset = SequenceDataset(train_df, input_col="English", target_col="German", input_max_len=input_max_len, target_max_len=target_max_len)
hold_out_dataset = SequenceDataset(hold_out_df, input_col="English", target_col="German", input_max_len=input_max_len, target_max_len=target_max_len)

# DataLoader
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
holdout_dataloader = DataLoader(hold_out_dataset, batch_size=batch_size, shuffle=True)

class BidirectionalLSTM(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
        super(BidirectionalLSTM, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

    def forward(self, x):
        embedded = self.embedding(x)  # Convert token indices to embeddings
        output, _ = self.lstm(embedded)  # LSTM forward pass
        output = self.fc(output)  # Fully connected layer
        return output

# Model parameters
input_dim = train_df["English"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of English
output_dim = train_df["German"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of German
embed_dim = 128
hidden_dim = 256
batch_size = 32
num_epochs = 20
learning_rate = 0.001

# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = BidirectionalLSTM(input_dim, embed_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 3: Training Loop
#num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, targets in train_dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass
        outputs = model(inputs)

        # Reshape outputs and targets for loss computation
        outputs = outputs.view(-1, output_dim)  # Flatten for CrossEntropyLoss
        targets = targets.view(-1)  # Flatten targets

        # Compute loss
        loss = criterion(outputs, targets)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")

# # Step 4: Evaluation
# def evaluate_model(model, dataloader):
#     model.eval()
#     total_loss = 0
#     with torch.no_grad():
#         for inputs, targets in dataloader:
#             inputs = inputs.to(device)
#             targets = targets.to(device)

#             outputs = model(inputs)
#             outputs = outputs.view(-1, output_dim)
#             targets = targets.view(-1)

#             loss = criterion(outputs, targets)
#             total_loss += loss.item()
#     return total_loss / len(dataloader)

# # Test the model (using train data as test data for simplicity here)
# test_loss = evaluate_model(model, train_dataloader)
# print(f"Test Loss: {test_loss}")


cuda
Epoch 1/20, Loss: 2.8836493992580556
Epoch 2/20, Loss: 2.352052436614374
Epoch 3/20, Loss: 1.8609018999872342
Epoch 4/20, Loss: 1.4628527197846264
Epoch 5/20, Loss: 1.2348058274332083
Epoch 6/20, Loss: 1.093099350619288
Epoch 7/20, Loss: 0.9929564326193254
Epoch 8/20, Loss: 0.9148583354709564
Epoch 9/20, Loss: 0.8491544980590917
Epoch 10/20, Loss: 0.79494068612573
Epoch 11/20, Loss: 0.7493307876646659
Epoch 12/20, Loss: 0.7089088592977034
Epoch 13/20, Loss: 0.6756446088646662
Epoch 14/20, Loss: 0.645709201140013
Epoch 15/20, Loss: 0.620196433175566
Epoch 16/20, Loss: 0.5965943441216676
Epoch 17/20, Loss: 0.5755609249309549
Epoch 18/20, Loss: 0.555085060590843
Epoch 19/20, Loss: 0.5370818546419647
Epoch 20/20, Loss: 0.5233660059155159


In [38]:
input_dim = train_df["English"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of English
model.eval()
with torch.no_grad():
    for inputs, targets in train_dataloader:
        inputs = inputs.to(device) # Move inputs to the correct device
        # Removed one-hot encoding - the embedding layer expects integer indices
        outputs = model(inputs)
        print("Sample prediction:", torch.argmax(outputs, dim=2)[0].cpu().numpy())
        break

Sample prediction: [52240  4653 17732 52241 52242  3769   555 52243  1349  1697  7935     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [39]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Define a function to compute BLEU score
def compute_bleu_score(reference, hypothesis):
    """
    Compute BLEU score for a single pair of reference and hypothesis sequences.
    :param reference: List of integers (ground truth sequence)
    :param hypothesis: List of integers (predicted sequence)
    :return: BLEU score
    """
    smoothing = SmoothingFunction().method1
    return sentence_bleu([reference], hypothesis, smoothing_function=smoothing)

# Evaluate BLEU score on a batch
model.eval()
total_bleu = 0
count = 0

with torch.no_grad():
    for inputs, targets in train_dataloader:
        # inputs_one_hot = torch.nn.functional.one_hot(inputs, num_classes=input_dim).float().to(device)
        inputs = inputs.to(device) # Move the inputs to the appropriate device
        targets = targets.to(device)

        # Forward pass
        outputs = model(inputs) # Pass the original inputs (integer indices)

        # Get the predicted sequences
        predictions = torch.argmax(outputs, dim=2)  # Shape: [batch_size, seq_length]

        # Compute BLEU for each sequence in the batch
        for i in range(predictions.size(0)):
            reference = targets[i].cpu().tolist()
            hypothesis = predictions[i].cpu().tolist()

            # Remove padding tokens (0s)
            reference = [token for token in reference if token != 0]
            hypothesis = [token for token in hypothesis if token != 0]

            bleu_score = compute_bleu_score(reference, hypothesis)
            total_bleu += bleu_score
            count += 1

# Compute the average BLEU score for the dataset
average_bleu = total_bleu / count
print(f"Average BLEU Score: {average_bleu:.4f}")

Average BLEU Score: 0.6180


In [None]:
# Saving the Bidirectional LSTM model
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/Bidirectional_LSTM_model.pth")

In [None]:
loaded_model = BidirectionalRNN(input_dim, embed_dim, hidden_dim, output_dim)
loaded_model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/Capstone_VinayM/Bidirectional_LSTM_model.pth"))
loaded_model.to(device)