In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Download necessary NLTK data
# 'punkt' covers older NLTK releases; newer releases look up the 'punkt_tab'
# resource when word_tokenize is called, so fetch both to keep the notebook
# runnable on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yashp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the English/Telugu sentence-pair CSV and take a first look at it.
# NOTE(review): this is an absolute path on one specific Windows machine; the
# notebook will not run elsewhere until it is pointed at a local copy.
filepath = r"C:\Users\yashp\en_te.csv"
df = pd.read_csv(filepath)
print(df.head(10))  # Check initial data
print(df.columns)

             english                 telugu
0              Hello                    హలో
1      Are you good?       మీరు బాగున్నారా?
2         I am happy  నేను సంతోషంగా ఉన్నాను
3       how are you?      మీరు ఎలా ఉన్నారు?
4          I am good        నేను భాగున్నాను
5      Are you good?       మీరు బాగున్నారా?
6         I am angry    నేను కోపంగా ఉన్నాను
7           I am sad  నేను విచారంగా ఉన్నాను
8  This is my friend      ఇది నా స్నేహితుడు
9    This is my home           ఇది నా ఇల్లు
Index(['english', 'telugu'], dtype='object')


In [3]:
# Count missing values per column before any cleaning.
df.isnull().sum()

english    0
telugu     0
dtype: int64

In [4]:
# Drop every row that has a missing value, mutating df in place.
df.dropna(inplace=True)

In [5]:
# Re-count missing values per column after the in-place dropna.
df.isnull().sum()

english    0
telugu     0
dtype: int64

In [6]:
# Preview the first five rows of the raw sentence pairs.
df.head()

Unnamed: 0,english,telugu
0,Hello,హలో
1,Are you good?,మీరు బాగున్నారా?
2,I am happy,నేను సంతోషంగా ఉన్నాను
3,how are you?,మీరు ఎలా ఉన్నారు?
4,I am good,నేను భాగున్నాను


In [10]:
# Load dataset
def load_data(filepath):
    """Read the parallel corpus CSV and return its complete sentence pairs.

    Args:
        filepath: Path (or any object accepted by ``pd.read_csv``) of a CSV
            file that contains at least the columns ``english`` and ``telugu``.

    Returns:
        A new DataFrame holding only the ``english`` and ``telugu`` columns,
        with every row that has a missing value in either column removed.
        The original row index labels are kept.

    Raises:
        KeyError: If either expected column is absent from the file.
    """
    df = pd.read_csv(filepath)  # Load the CSV without renaming columns
    # Select the two relevant columns and drop incomplete pairs in one chain,
    # instead of mutating a column slice in place.
    return df[['english', 'telugu']].dropna()

# Clean text
def clean_text(text):
    """Return ``text`` lower-cased with all punctuation removed.

    Any character that is neither a word character (``\\w``) nor whitespace
    (``\\s``) is deleted; nothing is inserted in its place.
    """
    lowered = text.lower()
    return re.sub(r"[^\w\s]", "", lowered)

# Preprocess dataset
def preprocess_data(df):
    """Return a copy of ``df`` whose ``english`` column has been normalised.

    The English side is passed through ``clean_text`` (lower-casing and
    punctuation removal). The Telugu side is deliberately left untouched.
    NOTE(review): ``clean_text``'s ``\\w``-based pattern would presumably strip
    Telugu combining signs as well, so it should not be reused on that
    column without verification.

    Args:
        df: DataFrame with ``english`` and ``telugu`` string columns.

    Returns:
        A new DataFrame; the frame passed in is not modified, so re-running
        the calling cell always starts from the same input.

    Raises:
        KeyError: If ``english`` or ``telugu`` is missing.
    """
    if "telugu" not in df.columns:
        # The previous implementation failed with KeyError here as well.
        raise KeyError("telugu")
    return df.assign(english=df["english"].apply(clean_text))

# Tokenization
def tokenize_text(df):
    """Tokenize both sides of the corpus with NLTK's ``word_tokenize``.

    Returns a pair ``(english_tokens, telugu_tokens)``; each element is a list
    with one token list per row of ``df``, in row order.
    NOTE(review): ``word_tokenize`` is called with its default settings for
    the Telugu column too - confirm that this splits Telugu text acceptably.
    """
    tokens_by_column = [
        list(map(word_tokenize, df[column])) for column in ('english', 'telugu')
    ]
    return tuple(tokens_by_column)

# Build vocabulary
def build_vocab(tokenized_texts):
    """Map every distinct token to an integer id, reserving 0 for ``<PAD>``.

    Ids are assigned in sorted token order, so the same corpus always yields
    the same mapping. (Enumerating a raw set gives a different order on each
    interpreter start, which silently breaks any model checkpoint that is
    later paired with a freshly rebuilt vocabulary.)

    Args:
        tokenized_texts: Iterable of token lists.

    Returns:
        dict mapping token -> id. ``'<PAD>'`` is 0 and real tokens occupy the
        contiguous range ``1 .. len(vocab) - 1``, so ``len(vocab)`` is a valid
        embedding / output size.
    """
    words = {word for sentence in tokenized_texts for word in sentence}
    # A literal '<PAD>' token in the data must not claim a regular id;
    # otherwise the largest id would equal len(vocab) and overflow the
    # embedding table.
    words.discard('<PAD>')
    vocab = {'<PAD>': 0}
    vocab.update((word, idx) for idx, word in enumerate(sorted(words), start=1))
    return vocab

# Encode sentences
def encode_sentences(sentences, vocab, max_len):
    """Convert token lists into a fixed-width matrix of vocabulary ids.

    Each sentence is truncated to ``max_len`` tokens or right-padded with 0.
    Tokens missing from ``vocab`` are also encoded as 0, i.e. they are
    indistinguishable from padding.

    Args:
        sentences: Iterable of token lists.
        vocab: dict mapping token -> integer id.
        max_len: Number of columns in the result.

    Returns:
        ``np.ndarray`` of dtype int64 and shape ``(len(sentences), max_len)``.
        The shape is 2-D even when ``sentences`` is empty, and the dtype does
        not depend on the platform's default integer width.
    """
    pad_id = 0
    sentences = list(sentences)
    encoded = np.full((len(sentences), max_len), pad_id, dtype=np.int64)
    for row, sent in enumerate(sentences):
        ids = [vocab.get(word, pad_id) for word in sent[:max_len]]
        if ids:
            encoded[row, :len(ids)] = ids
    return encoded

# PyTorch Dataset Class
class TranslationDataset(Dataset):
    """Pairs encoded source rows with encoded target rows for a DataLoader."""

    def __init__(self, X, y):
        """Store ``X`` and ``y`` as int64 (``torch.long``) tensors."""
        self.X, self.y = (torch.tensor(data, dtype=torch.long) for data in (X, y))

    def __len__(self):
        """Number of examples, taken from the first dimension of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` tensors stored at ``idx``."""
        source, target = self.X[idx], self.y[idx]
        return source, target

# LSTM Model Class
class LSTMTranslator(nn.Module):
    """Bidirectional single-layer LSTM that emits one target-vocabulary
    distribution per input position.

    The output has the same sequence length as the input, so position ``i``
    of the source is scored against position ``i`` of the target; there is
    no separate decoder.

    Args:
        vocab_size: Number of rows in the source embedding table.
        embed_size: Embedding dimension.
        hidden_size: LSTM hidden size per direction.
        output_size: Size of the target vocabulary (number of logits).
        dropout: Dropout probability applied to the LSTM outputs before the
            final projection. Defaults to 0.6, the value used previously.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, dropout=0.6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.LSTM's own `dropout` argument only acts between stacked layers.
        # With a single layer it does nothing except raise a UserWarning, so
        # it is omitted; regularisation comes from self.dropout below.
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True, bidirectional=True)
        # Forward and backward hidden states are concatenated -> 2 * hidden_size.
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ids of shape (batch, seq) to logits of shape (batch, seq, output_size)."""
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)  # Apply dropout before FC
        x = self.fc(x)
        return x



# Load and process data
# Runs the full text pipeline: read CSV -> clean English -> tokenize both
# sides -> build one vocabulary per language. All names assigned here are
# notebook-global and are consumed by the later split/model/inference cells.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
filepath = r"C:\Users\yashp\en_te.csv"
df = load_data(filepath)
df = preprocess_data(df)
engHT_tokens, TL_tokens = tokenize_text(df)
engHT_vocab = build_vocab(engHT_tokens)
TL_vocab = build_vocab(TL_tokens)

# Set max sequence length
# Sentences longer than max_len tokens are truncated and shorter ones are
# padded with id 0 by encode_sentences; X and y therefore share one width.
max_len = 10
X = encode_sentences(engHT_tokens, engHT_vocab, max_len)
y = encode_sentences(TL_tokens, TL_vocab, max_len)

In [11]:
# Preview the first rows after preprocessing (english has been cleaned, telugu is unchanged).
df.head()

Unnamed: 0,english,telugu
0,hello,హలో
1,are you good,మీరు బాగున్నారా?
2,i am happy,నేను సంతోషంగా ఉన్నాను
3,how are you,మీరు ఎలా ఉన్నారు?
4,i am good,నేను భాగున్నాను


In [12]:
# Preview the last rows of the preprocessed corpus.
df.tail()

Unnamed: 0,english,telugu
10200,that was how he came to win 1 million,అతను million 1 మిలియన్ గెలిచాడు.
10201,jim liked driving around town with his hazard ...,జిమ్ తన ప్రమాదకర లైట్లతో పట్టణం చుట్టూ డ్రైవిం...
10202,so long and thanks for the fish,చాలా కాలం మరియు చేపలకు ధన్యవాదాలు.
10203,he barked orders at his daughters but they jus...,"అతను తన కుమార్తెల వద్ద ఆదేశాలను మొరాయిస్తాడు, ..."
10204,barking dogs and screaming toddlers have the u...,మొరిగే కుక్కలు మరియు అరుస్తూ పసిబిడ్డలు స్నేహప...


In [13]:
# Train-test split
# 80/20 split. random_state is pinned so that "Restart & Run All" reproduces
# the same train/validation membership (and therefore comparable losses).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Prepare DataLoader
# Wrap the encoded arrays in TranslationDataset and batch them 32 at a time.
# Only the training loader shuffles; validation keeps a fixed order.
train_dataset = TranslationDataset(X_train, y_train)
val_dataset = TranslationDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [15]:
# Define Model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Positional arguments: source vocab size, embedding size 128, hidden size 512
# per direction, target vocab size.
model = LSTMTranslator(len(engHT_vocab), 128, 512, len(TL_vocab)).to(device)
# Token-level cross-entropy; padded target positions (id 0) are included in the loss.
criterion = nn.CrossEntropyLoss()
# NOTE(review): the training cell assigns a new Adam optimizer to this same
# name, so this instance is never stepped.
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [16]:
import torch

# Pick the compute device and report which one will be used.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# On CUDA, include the name of GPU 0; otherwise state that the CPU is in use.
print(
    f"Using CUDA device: {torch.cuda.get_device_name(0)}"
    if device.type == 'cuda'
    else "Using CPU"
)

print(f"Device type: {device.type}")  # Print the type of device

Using CUDA device: NVIDIA GeForce RTX 4060 Laptop GPU
Device type: cuda


In [None]:
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau

epochs = 30
train_losses, val_losses = [], []
best_val_loss = float("inf")
patience, patience_counter = 5, 0
best_model_path = "best_translator_model.pth"
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Halve the learning rate after 2 epochs without validation improvement.
# The deprecated `verbose` flag is not passed; LR changes are printed
# explicitly in the loop below instead.
scheduler = ReduceLROnPlateau(optimizer, mode="min", patience=2, factor=0.5)


def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    When ``optimizer`` is given the model is put in train mode and updated
    after every batch; when it is None the model is put in eval mode and
    gradients are disabled.
    """
    training = optimizer is not None
    model.train(training)
    total_loss = 0.0
    with torch.set_grad_enabled(training):
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            if training:
                optimizer.zero_grad()
            output = model(X_batch)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) so every token
            # position is scored as an independent classification.
            loss = criterion(output.view(-1, output.shape[-1]), y_batch.view(-1))
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
    return total_loss / len(loader)


for epoch in range(epochs):
    train_losses.append(_run_epoch(model, train_loader, criterion, device, optimizer))

    # Validation
    val_losses.append(_run_epoch(model, val_loader, criterion, device))

    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(val_losses[-1])  # Adjust learning rate
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after < lr_before:
        print(f"Reducing learning rate to {lr_after:.4e}.")

    print(f"Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}")

    # Early Stopping
    if val_losses[-1] < best_val_loss:
        best_val_loss = val_losses[-1]
        torch.save(model.state_dict(), best_model_path)
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Roll the in-memory model back to the best validation checkpoint. Without
# this, `model` keeps the weights of the last (typically overfit) epoch and
# any later cell that saves or evaluates it uses those instead of the best.
if best_val_loss < float("inf"):
    model.load_state_dict(torch.load(best_model_path, map_location=device))



Epoch 1, Train Loss: 6.6576, Val Loss: 5.9517
Epoch 2, Train Loss: 5.1750, Val Loss: 4.7533
Epoch 3, Train Loss: 3.8455, Val Loss: 4.0729
Epoch 4, Train Loss: 2.9693, Val Loss: 3.7880
Epoch 5, Train Loss: 2.3411, Val Loss: 3.6356
Epoch 6, Train Loss: 1.8358, Val Loss: 3.5870
Epoch 7, Train Loss: 1.4651, Val Loss: 3.5867
Epoch 8, Train Loss: 1.2107, Val Loss: 3.6060
Epoch 9, Train Loss: 1.0358, Val Loss: 3.6045
Epoch 10, Train Loss: 0.7920, Val Loss: 3.5917
Epoch 11, Train Loss: 0.6833, Val Loss: 3.6191


In [None]:
# Overlay the per-epoch training and validation loss curves on one figure.
for _history, _label in ((train_losses, 'Train Loss'), (val_losses, 'Val Loss')):
    plt.plot(_history, label=_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
import pickle

# Persist the model weights together with both vocabularies: the saved
# weights are only meaningful with the exact token -> id mappings used
# during training.
torch.save(model.state_dict(), "translator_model_English_to_telugu.pth")

# Context managers guarantee the pickle files are flushed and closed
# (a bare open() passed to pickle.dump leaves the handle open).
with open("engHT_vocab.pkl", "wb") as vocab_file:
    pickle.dump(engHT_vocab, vocab_file)
with open("TL_vocab.pkl", "wb") as vocab_file:
    pickle.dump(TL_vocab, vocab_file)

In [None]:
def translate_sentence(model, sentence, engHT_vocab, TL_vocab, max_len=10, device=None):
    """Translate one English sentence with the trained position-wise model.

    Args:
        model: Trained ``LSTMTranslator`` (or compatible module) returning
            logits of shape (batch, seq, target_vocab).
        sentence: Raw English text.
        engHT_vocab: Source token -> id mapping used during training.
        TL_vocab: Target token -> id mapping used during training.
        max_len: Sequence length the model was trained with.
        device: Device for the input tensor. When None (default) the device
            of the model's parameters is used, so the function works on both
            CPU-only and GPU hosts without extra arguments.

    Returns:
        The predicted target tokens joined by single spaces. Positions
        predicted as padding (id 0) or as an id that is not in ``TL_vocab``
        are omitted.
    """
    if device is None:
        try:
            device = next(model.parameters()).device
        except StopIteration:
            # Parameter-free module: nothing to infer from, stay on the CPU.
            device = torch.device("cpu")

    model.eval()
    pad_id = 0
    # Apply the same normalisation as training (lower-case + punctuation
    # removal); otherwise punctuation tokens fall out of vocabulary.
    tokens = word_tokenize(clean_text(sentence))
    encoded = [engHT_vocab.get(word, pad_id) for word in tokens][:max_len]
    padded = encoded + [pad_id] * (max_len - len(encoded))
    input_tensor = torch.tensor([padded], dtype=torch.long, device=device)

    with torch.no_grad():
        output = model(input_tensor)

    # Index the single batch element explicitly; squeeze() would also drop
    # the sequence dimension when max_len == 1.
    predicted_tokens = output.argmax(dim=2)[0].tolist()
    rev_TL_vocab = {idx: word for word, idx in TL_vocab.items()}
    words = [rev_TL_vocab.get(idx, "") for idx in predicted_tokens if idx != pad_id]
    translated_sentence = " ".join(word for word in words if word)

    return translated_sentence

In [None]:
# Example usage
# Rebuild the architecture with the same sizes used for training, then load
# the saved weights. map_location lets a checkpoint written on a GPU machine
# load on a CPU-only one (and vice versa).
loaded_model = LSTMTranslator(len(engHT_vocab), 128, 512, len(TL_vocab)).to(device)
loaded_model.load_state_dict(
    torch.load("translator_model_English_to_telugu.pth", map_location=device)
)
loaded_model.eval()

example_sentence = "Thank You"
predicted_translation = translate_sentence(loaded_model, example_sentence, engHT_vocab, TL_vocab, device=device)
print(f"Translation: {predicted_translation}")