In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
Path = '/content/drive/MyDrive/MLT+Dataset/Dataset/'

In [3]:
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
from nltk.stem import PorterStemmer

**Step 1: Import and merge all the three datasets**

In [4]:

# Load each English and German file
german_files = ["commoncrawl_de_en.txt", "europarl-v7_de_en.txt", "news-commentary-v9_de_en.txt"]
english_files = ["commoncrawl_en_de.txt", "europarl-v7_en_de.txt", "news-commentary-v9_en_de.txt"]

In [5]:

# Read English and German text files
with open(Path + 'commoncrawl_en_de.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open(Path + 'commoncrawl_de_en.txt', 'r', encoding='utf-8') as ger_file:
    german_sentences = ger_file.readlines()

# Strip any unnecessary whitespace (e.g., newline characters)
commoncrawl_english_sentences = [sentence.strip() for sentence in english_sentences]
commoncrawl_german_sentences = [sentence.strip() for sentence in german_sentences]

In [6]:
if len(commoncrawl_english_sentences) != len(commoncrawl_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [7]:
commoncrawl_en_de_df = pd.DataFrame({
    'English': commoncrawl_english_sentences,
    'German': commoncrawl_german_sentences
})

In [8]:
commoncrawl_en_de_df.shape

(2399123, 2)

In [9]:
# Read English and German text files
with open(Path + 'europarl-v7_en_de.txt', 'r', encoding='utf-8') as eng_file:
    english_sentences = eng_file.readlines()

with open(Path + 'europarl-v7_de_en.txt', 'r', encoding='utf-8') as ger_file:
    german_sentences = ger_file.readlines()

# Strip any unnecessary whitespace (e.g., newline characters)
europarl_english_sentences = [sentence.strip() for sentence in english_sentences]
europarl_german_sentences = [sentence.strip() for sentence in german_sentences]

In [10]:
if len(europarl_english_sentences) != len(europarl_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [11]:

europarl_en_de_df = pd.DataFrame({
    'English': europarl_english_sentences,
    'German': europarl_english_sentences
})

In [12]:
europarl_en_de_df.shape

(1920209, 2)

In [13]:
# Read English and German text files
# cleaning data for news comm in german
def is_special_characters(line):
    return re.match(r'^[^a-zA-Z0-9\s]+$', line) is not None

commentary_english_sentences = []
commentary_german_sentences = []

with open(Path + 'news-commentary-v9_en_de.txt', 'r', encoding='utf-8') as file:
  for line in file:
        cleaned_line = line.strip()

        if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
            # Replace multiple spaces with a single space
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line)
            commentary_english_sentences.append(cleaned_line)

with open(Path + 'news-commentary-v9_en_de.txt', 'r', encoding='utf-8') as file:
  for line in file:
        cleaned_line = line.strip()

        if cleaned_line and not cleaned_line.isdigit() and not is_special_characters(cleaned_line):
            # Replace multiple spaces with a single space
            cleaned_line = re.sub(r'\s+', ' ', cleaned_line)
            commentary_german_sentences.append(cleaned_line)

In [14]:
if len(commentary_english_sentences) != len(commentary_german_sentences):
    raise ValueError("The number of sentences in the English and German files do not match.")

In [15]:
commentary_en_de_df = pd.DataFrame({
    'English': commentary_english_sentences,
    'German': commentary_german_sentences
})

In [16]:
commentary_en_de_df.shape

(201553, 2)

In [17]:
final_df = pd.concat([commoncrawl_en_de_df, europarl_en_de_df, commentary_en_de_df], axis=0, ignore_index=True)

In [18]:
final_df.shape

(4520885, 2)

**Step 2: Data cleansing**

In [52]:
train_df = final_df.sample(frac=0.007, random_state=12)

In [53]:
final_df.shape

(4520885, 2)

In [54]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters and punctuation (except apostrophes in contractions)
    text = re.sub(r"[^a-zA-ZäöüßÄÖÜéèàùâêîôûç'\s]", '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning function to both columns
train_df['English'] = train_df['English'].apply(clean_text)
train_df['German'] = train_df['German'].apply(clean_text)

# Display cleaned DataFrame
print(train_df)

                                                   English  \
2067265           the counterpart appeared two years later   
152051             public parking is available metres away   
1684613  we guarantee the return of amount paid for the...   
2757674  that question was put to the citizens of switz...   
3663922  we welcome that because we the united states n...   
...                                                    ...   
217733   the bedrooms are very small but are very clean...   
2058614  you will reach behy and the camp before youd c...   
4438570  santa claus was a turkish dervish who in the m...   
21856    always prepared with the freshest ingredients ...   
1571836  toolbar contains shortcuts to frequently used ...   

                                                    German  
2067265         das gegenstück erscheint zwei jahre später  
152051   öffentliche parkmöglichkeiten befinden sich in...  
1684613  wir garantieren die zurückzahlung der für die ...  
2757674  th

**Step 3: NLP pre processing - Dataset suitable to be used for AIML model learning**

In [55]:
train_df = train_df.reset_index()

In [56]:
train_df.head()

Unnamed: 0,index,English,German
0,2067265,the counterpart appeared two years later,das gegenstück erscheint zwei jahre später
1,152051,public parking is available metres away,öffentliche parkmöglichkeiten befinden sich in...
2,1684613,we guarantee the return of amount paid for the...,wir garantieren die zurückzahlung der für die ...
3,2757674,that question was put to the citizens of switz...,that question was put to the citizens of switz...
4,3663922,we welcome that because we the united states n...,we welcome that because we the united states n...


In [57]:
train_df.shape

(31646, 3)

In [58]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [59]:
nltk.download('stopwords')

from nltk.corpus import stopwords

german_stop_words = stopwords.words('german')
english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
chunk_size = 500000  # Adjust this based on available memory
max_seq_length = 50  # Adjust based on data analysis
output_dir = '/content/data1117/preprocessed_chunks_training/'
os.makedirs(output_dir, exist_ok=True)

# Initialize tokenizers
stemmer = PorterStemmer()
english_tokenizer = Tokenizer()
german_tokenizer = Tokenizer()

# Split the DataFrame into chunks manually
num_chunks = len(train_df) // chunk_size + (1 if len(train_df) % chunk_size != 0 else 0)

# Step 1: Fit Tokenizers Across Chunks
for i in range(num_chunks):
    print("num_chunk----->", i)
    chunk = train_df.iloc[i * chunk_size:(i + 1) * chunk_size]
    chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
    chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for token in tokens if token not in german_stop_words])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [token for token in tokens if token not in english_stop_words])

    chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['english_tokens'] = chunk['english_tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
    chunk['German'] = chunk['german_tokens'].apply(lambda tokens: ' '.join(tokens))
    chunk['English'] = chunk['english_tokens'].apply(lambda tokens: ' '.join(tokens))

    # Update tokenizers
    english_tokenizer.fit_on_texts(chunk['English'])
    german_tokenizer.fit_on_texts(chunk['German'])

    # Convert to sequences
    english_sequences = english_tokenizer.texts_to_sequences(chunk['English'])
    german_sequences = german_tokenizer.texts_to_sequences(chunk['German'])

    # Pad sequences
    english_padded = pad_sequences(english_sequences, maxlen=max_seq_length, padding='post')
    german_padded = pad_sequences(german_sequences, maxlen=max_seq_length, padding='post')

    # Save each chunk
    np.save(os.path.join(output_dir, f'english_chunk_{i}.npy'), english_padded)
    np.save(os.path.join(output_dir, f'german_chunk_{i}.npy'), german_padded)

print("Data processing in chunks completed. Tokenized sequences are saved to disk.")


num_chunk-----> 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['German'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['english_tokens'] = chunk['English'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['german_tokens'] = chunk['german_tokens'].apply(lambda tokens: [token for

Data processing in chunks completed. Tokenized sequences are saved to disk.


In [61]:
# Initialize lists to store all data
english_data = []
german_data = []
output_dir = '/content/data1117/preprocessed_chunks_training/'
# Iterate through all saved chunks
num_chunks = len([name for name in os.listdir(output_dir) if name.startswith('english_chunk_')])

for i in range(num_chunks):
    # Load the English and German chunks
    english_chunk = np.load(os.path.join(output_dir, f'english_chunk_{i}.npy'))
    german_chunk = np.load(os.path.join(output_dir, f'german_chunk_{i}.npy'))

    # Append to the list
    english_data.extend(english_chunk)
    german_data.extend(german_chunk)

# Convert lists to DataFrame
preprocessed_training_data = pd.DataFrame({
    'English': english_data,
    'German': german_data
})

print("Combined DataFrame created successfully.")
print(preprocessed_training_data.head())

Combined DataFrame created successfully.
                                             English  \
0  [3683, 573, 61, 18, 710, 0, 0, 0, 0, 0, 0, 0, ...   
1  [88, 391, 115, 1314, 451, 0, 0, 0, 0, 0, 0, 0,...   
2  [429, 552, 507, 1376, 30, 0, 0, 0, 0, 0, 0, 0,...   
3  [80, 178, 223, 1692, 0, 0, 0, 0, 0, 0, 0, 0, 0...   
4  [276, 197, 8, 17, 801, 2641, 1406, 102, 20, 14...   

                                              German  
0  [14086, 2496, 265, 129, 1091, 0, 0, 0, 0, 0, 0...  
1  [1478, 11006, 946, 309, 2497, 0, 0, 0, 0, 0, 0...  
2  [3409, 14087, 235, 21266, 6471, 0, 0, 0, 0, 0,...  
3  [6, 127, 196, 3, 1, 191, 2, 7122, 0, 0, 0, 0, ...  
4  [10, 417, 6, 90, 10, 1, 282, 33, 51, 990, 3410...  


In [62]:
preprocessed_training_data.to_parquet('/content/drive/MyDrive/capstone_preprocess_training.parquet')

**Step 4: Design, train and test simple RNN**

In [63]:
train_df = pd.read_parquet('/content/drive/MyDrive/capstone_preprocess_training.parquet')

In [64]:
train_df.shape

(31646, 2)

In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

# Step 1: Custom Dataset Class
class SequenceDataset(Dataset):
    def __init__(self, df, input_col, target_col, input_max_len=None, target_max_len=None):
        self.input_sequences = self.pad_sequences(df[input_col].tolist(), max_length=input_max_len)
        self.target_sequences = self.pad_sequences(df[target_col].tolist(), max_length=target_max_len)

    def pad_sequences(self, sequences, max_length=None):
        valid_sequences = [seq for seq in sequences if len(seq) > 0]
        if max_length is None:
            max_length = max(len(seq) for seq in valid_sequences)
        return np.array(
            valid_sequences, dtype=np.int64
        )

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.input_sequences[idx], dtype=torch.long),
            torch.tensor(self.target_sequences[idx], dtype=torch.long),
        )

# Load and preprocess data
# Assume df is a Pandas DataFrame with "English" and "German" columns
input_max_len = 50  # Set maximum sequence lengths
target_max_len = 50
train_dataset = SequenceDataset(train_df, input_col="English", target_col="German", input_max_len=input_max_len, target_max_len=target_max_len)
# hold_out_dataset = SequenceDataset(hold_out_df, input_col="English", target_col="German", input_max_len=input_max_len, target_max_len=target_max_len)

# DataLoader
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)




In [69]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np


# Step 2: Define RNN Model with Embeddings
class RNNWithEmbedding(nn.Module):
    def __init__(self, input_dim, output_dim, embed_dim, hidden_dim, padding_idx):
        super(RNNWithEmbedding, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)  # Embed input tokens
        output, _ = self.rnn(embedded)  # RNN layer
        output = self.fc(output)  # Fully connected layer
        return output

# Model parameters
input_vocab_size = train_df["English"].apply(lambda x: max(x)).max() + 1
output_vocab_size = train_df["German"].apply(lambda x: max(x)).max() + 1
embed_dim = 128  # Dimension of word embeddings
hidden_dim = 128  # Hidden state dimension
padding_idx = 0  # Token for padding

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNNWithEmbedding(input_vocab_size, output_vocab_size, embed_dim, hidden_dim, padding_idx).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=padding_idx)  # Ignore padding in loss calculation
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 3: Training Loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)

        # Reshape for loss computation
        outputs = outputs.view(-1, output_vocab_size)
        targets = targets.view(-1)

        loss = criterion(outputs, targets)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")




Epoch 1/50, Loss: 8.19080402612445
Epoch 2/50, Loss: 7.398646698441616
Epoch 3/50, Loss: 6.871127482492352
Epoch 4/50, Loss: 6.437763185665027
Epoch 5/50, Loss: 6.07901947177698
Epoch 6/50, Loss: 5.769907738248788
Epoch 7/50, Loss: 5.533795343009478
Epoch 8/50, Loss: 5.395364112391149
Epoch 9/50, Loss: 5.210550840631414
Epoch 10/50, Loss: 5.098154233849567
Epoch 11/50, Loss: 4.975869658984116
Epoch 12/50, Loss: 4.967288739037586
Epoch 13/50, Loss: 4.887492290039757
Epoch 14/50, Loss: 4.948541448258534
Epoch 15/50, Loss: 4.762877324473629
Epoch 16/50, Loss: 4.775150833886844
Epoch 17/50, Loss: 4.724256018054256
Epoch 18/50, Loss: 4.821493205455244
Epoch 19/50, Loss: 4.7493548381196
Epoch 20/50, Loss: 4.637461034786352
Epoch 21/50, Loss: 4.638160916623501
Epoch 22/50, Loss: 4.643048938051153
Epoch 23/50, Loss: 4.6637857933015505
Epoch 24/50, Loss: 4.687742391056913
Epoch 25/50, Loss: 4.825265346571457
Epoch 26/50, Loss: 4.627383795980255
Epoch 27/50, Loss: 4.612835032151611
Epoch 28/50, 

In [70]:
# Step 4: Testing and Evaluation
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


model.eval()
bleu_scores = []
with torch.no_grad():
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=2)

    smoothing = SmoothingFunction()
    for pred, target in zip(predictions.cpu().numpy(), targets.cpu().numpy()):
        pred_tokens = [token for token in pred if token != padding_idx]
        target_tokens = [token for token in target if token != padding_idx]
        bleu_scores.append(sentence_bleu([target_tokens], pred_tokens, smoothing_function=smoothing.method1))

print(f"Average BLEU Score with Smoothing: {np.mean(bleu_scores)}")


Average BLEU Score with Smoothing: 0.035434258592088964


In [66]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Step 3: Define the LSTM Model with Embedding
class LSTMWithEmbedding(nn.Module):
    def __init__(self, input_vocab_size, output_vocab_size, embedding_dim, hidden_dim):
        super(LSTMWithEmbedding, self).__init__()
        self.embedding = nn.Embedding(input_vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)  # Convert token IDs to embeddings
        output, (hidden, cell) = self.lstm(embedded)  # LSTM forward pass
        output = self.fc(output)  # Map to output vocabulary size
        return output

# Model parameters
input_vocab_size = train_df["English"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of English
output_vocab_size = train_df["German"].apply(lambda x: max(x)).max() + 1  # Vocabulary size of German
embedding_dim = 64
hidden_dim = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Initialize model, loss, and optimizer
model = LSTMWithEmbedding(input_vocab_size, output_vocab_size, embedding_dim, hidden_dim).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 4: Training Loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)

        # Reshape outputs and targets for loss computation
        outputs = outputs.view(-1, output_vocab_size)  # Flatten for CrossEntropyLoss
        targets = targets.view(-1)  # Flatten targets

        # Compute loss
        loss = criterion(outputs, targets)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_dataloader)}")




cuda
Epoch 1/50, Loss: 8.129580169162809
Epoch 2/50, Loss: 7.195653106373169
Epoch 3/50, Loss: 6.431390816087791
Epoch 4/50, Loss: 5.875089645867883
Epoch 5/50, Loss: 5.434603942778523
Epoch 6/50, Loss: 5.065609554911768
Epoch 7/50, Loss: 4.756647158922633
Epoch 8/50, Loss: 4.504145090813825
Epoch 9/50, Loss: 4.297210356222486
Epoch 10/50, Loss: 4.135872913564059
Epoch 11/50, Loss: 3.9996648069821426
Epoch 12/50, Loss: 3.887211584345757
Epoch 13/50, Loss: 3.7855895940888638
Epoch 14/50, Loss: 3.7012505068455956
Epoch 15/50, Loss: 3.627193490342737
Epoch 16/50, Loss: 3.5520575821580493
Epoch 17/50, Loss: 3.4912791008655173
Epoch 18/50, Loss: 3.430997374445893
Epoch 19/50, Loss: 3.3818515721177427
Epoch 20/50, Loss: 3.329871272655059
Epoch 21/50, Loss: 3.2868714740592138
Epoch 22/50, Loss: 3.2454835347627844
Epoch 23/50, Loss: 3.2104130215543587
Epoch 24/50, Loss: 3.1719611977783564
Epoch 25/50, Loss: 3.1377771199653557
Epoch 26/50, Loss: 3.1087083269901297
Epoch 27/50, Loss: 3.075891734

In [68]:
# Step 5: Evaluation
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
model.eval()
with torch.no_grad():
    total_bleu_score = 0
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)

        # Convert logits to predictions
        predictions = torch.argmax(outputs, dim=-1)

        # Compute BLEU Score
        bleu_scores = []
        smoothing = SmoothingFunction()
        for pred, target in zip(predictions.cpu().numpy(), targets.cpu().numpy()):
            pred_tokens = [token for token in pred if token != 0]  # Exclude padding
            target_tokens = [token for token in target if token != 0]  # Exclude padding
            bleu_scores.append(sentence_bleu([target_tokens], pred_tokens, smoothing_function=smoothing.method1))
        total_bleu_score += np.mean(bleu_scores)

    print(f"Average BLEU Score on Holdout: {total_bleu_score / len(train_dataloader)}")

Average BLEU Score on Holdout: 0.06511842757512007


In [None]:
# Step 4: Testing and Evaluation
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


model.eval()
bleu_scores = []
with torch.no_grad():
    for inputs, targets in train_dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        predictions = torch.argmax(outputs, dim=2)

    smoothing = SmoothingFunction()
    for pred, target in zip(predictions.cpu().numpy(), targets.cpu().numpy()):
        pred_tokens = [token for token in pred if token != padding_idx]
        target_tokens = [token for token in target if token != padding_idx]
        bleu_scores.append(sentence_bleu([target_tokens], pred_tokens, smoothing_function=smoothing.method1))

print(f"Average BLEU Score with Smoothing: {np.mean(bleu_scores)}")


In [71]:
# Saving the RNN model
torch.save(model.state_dict(), "/content/drive/MyDrive/rnn_embedding_model.pth")

In [72]:
loaded_model = RNNWithEmbedding(input_vocab_size, output_vocab_size, embed_dim, hidden_dim, padding_idx)
loaded_model.load_state_dict(torch.load("/content/drive/MyDrive/rnn_embedding_model.pth"))
loaded_model.to(device)

  loaded_model.load_state_dict(torch.load("/content/drive/MyDrive/rnn_embedding_model.pth"))


RNNWithEmbedding(
  (embedding): Embedding(33501, 128, padding_idx=0)
  (rnn): RNN(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=59659, bias=True)
)