In [1]:
from sympy import symbols, sin, cos, exp, ln, log, tan, asin, atan,cot
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Fetch the NLTK 'punkt' resource up front; presumably this is the model
# word_tokenize relies on at runtime — verify against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Dataset Generation and Tokenization

In [2]:
# Dataset Preprocessing
class TaylorDataset:
    """Builds a small (function, Taylor expansion) corpus and its integer vocabulary.

    Workflow: ``generate()`` produces a DataFrame of string pairs, ``tokenize()``
    turns those strings into lists of integer ids and populates the vocabulary
    mappings, and ``get_token_dicts()`` returns the mappings.

    Args:
        order: Highest power of ``x`` kept in each expansion.
        functions: Optional list of SymPy expressions in ``x``. A missing or
            empty value falls back to ``default_functions()``.
    """

    def __init__(self, order, functions=None):
        self.order = order
        self.functions = functions if functions else self.default_functions()
        # Both mappings stay None until tokenize() has been called.
        self.vocab_to_int = None
        self.int_to_vocab = None

    @staticmethod
    def default_functions():
        """Return the built-in list of SymPy expressions in ``x``."""
        x = symbols('x')
        # NOTE(review): the last row repeats 1/(1+x) and 1/(1+x)**2, which already
        # appear earlier in the list, so those samples occur twice in the generated
        # data. Left as-is because removing them changes the dataset (and the
        # frequency-ordered vocabulary); confirm whether the duplication is intended.
        return [
            sin(x), cos(x), exp(x), ln(1 + x), log(1 + x, 10),
            1 / (1 + x), x ** 2 + x + 1, tan(x), asin(x), atan(x),exp(sin(x)),exp(tan(x)), cot(x),
            1/(1+x**2), exp(x)*(1+x), exp(x)*(1-x), 1/(1+x)**2, 1/(1-x)**2, 1/(1-x)**3, 1/(1-x**2), log(3+4*x),
            1/(1+x), 1/(1+x)**2, 1/(1-x), -ln(1-x)
        ]

    def generate(self):
        """Expand every function around 0 and return the shuffled string pairs.

        Returns:
            DataFrame with string columns ``function`` and ``expansion``. Rows are
            shuffled with a fixed ``random_state`` and re-indexed from 0.
        """
        x = symbols('x')
        records = [
            {
                "function": str(func),
                # series() is taken up to order + 1 and the O() term dropped, so
                # the highest power that can remain is x**order.
                "expansion": str(func.series(x, 0, self.order + 1).removeO()),
            }
            for func in self.functions
        ]
        return pd.DataFrame(records).sample(frac=1, random_state=42, ignore_index=True)

    def tokenize(self, df):
        """Build the vocabulary from ``df`` and encode both columns as id lists.

        Expansion sequences are wrapped in ``<SOS>`` ... ``<EOS>``; function
        sequences are not. Ids are assigned by descending token frequency and
        start at 1, which leaves 0 unassigned (the rest of the notebook uses 0
        as the padding index).

        Args:
            df: DataFrame with string columns ``function`` and ``expansion``.

        Returns:
            DataFrame with columns ``function_tokens`` and ``expansion_tokens``,
            each cell holding a list of integer token ids.

        Side effects:
            Overwrites ``self.vocab_to_int`` and ``self.int_to_vocab``.
        """
        # Tokenize every row exactly once; the same token lists feed both the
        # vocabulary count and the final encoding.
        function_rows, expansion_rows = [], []
        for _, row in df.iterrows():
            function_rows.append(word_tokenize(row['function']))
            expansion_rows.append(['<SOS>'] + word_tokenize(row['expansion']) + ['<EOS>'])

        # Count in corpus order (function tokens, then expansion tokens, row by
        # row): the stable sort below breaks frequency ties by first appearance.
        counter = Counter()
        for function_tokens, expansion_tokens in zip(function_rows, expansion_rows):
            counter.update(function_tokens)
            counter.update(expansion_tokens)

        vocab = sorted(counter, key=counter.get, reverse=True)
        # Ensure special tokens exist:
        for special in ['<SOS>', '<EOS>', '<UNK>']:
            if special not in vocab:
                vocab.append(special)

        self.vocab_to_int = {token: i for i, token in enumerate(vocab, 1)}
        self.int_to_vocab = {i: token for token, i in self.vocab_to_int.items()}

        unk_id = self.vocab_to_int["<UNK>"]

        def encode(tokens):
            # Unknown tokens (none are expected here, since the vocabulary was
            # built from these same rows) fall back to <UNK>.
            return [self.vocab_to_int.get(token, unk_id) for token in tokens]

        tokenized_data = {
            "function_tokens": [encode(tokens) for tokens in function_rows],
            "expansion_tokens": [encode(tokens) for tokens in expansion_rows],
        }
        return pd.DataFrame(tokenized_data)

    def get_token_dicts(self):
        """Return ``(vocab_to_int, int_to_vocab)``; both are None before tokenize()."""
        return self.vocab_to_int, self.int_to_vocab

In [12]:
# Initialize Dataset
# `order` is the highest power of x kept in each expansion: generate() takes
# the series up to order + 1 and drops the O() remainder.
order = 4
taylor_dataset = TaylorDataset(order)
# Shuffled DataFrame of (function, expansion) string pairs.
df = taylor_dataset.generate()
df

Unnamed: 0,function,expansion
0,asin(x),x**3/6 + x
1,(x + 1)**(-2),5*x**4 - 4*x**3 + 3*x**2 - 2*x + 1
2,sin(x),-x**3/6 + x
3,1/(1 - x),x**4 + x**3 + x**2 + x + 1
4,exp(tan(x)),3*x**4/8 + x**3/2 + x**2/2 + x + 1
5,atan(x),-x**3/3 + x
6,1/(x**2 + 1),x**4 - x**2 + 1
7,cos(x),x**4/24 - x**2/2 + 1
8,(x + 1)**(-2),5*x**4 - 4*x**3 + 3*x**2 - 2*x + 1
9,1/(x + 1),x**4 - x**3 + x**2 - x + 1


In [14]:
# Build the vocabulary on `taylor_dataset` and convert both string columns
# into lists of integer token ids.
tokenized_df = taylor_dataset.tokenize(df)
tokenized_df

Unnamed: 0,function_tokens,expansion_tokens
0,"[31, 2, 4, 3]","[6, 24, 1, 4, 7]"
1,"[2, 4, 1, 5, 3, 14, 2, 18, 3]","[6, 19, 8, 20, 1, 21, 8, 15, 1, 5, 7]"
2,"[25, 2, 4, 3]","[6, 32, 1, 4, 7]"
3,"[11, 2, 5, 8, 4, 3]","[6, 12, 1, 22, 1, 9, 1, 4, 1, 5, 7]"
4,"[23, 2, 26, 2, 4, 3, 3]","[6, 33, 1, 34, 1, 10, 1, 4, 1, 5, 7]"
5,"[35, 2, 4, 3]","[6, 36, 1, 4, 7]"
6,"[11, 2, 9, 1, 5, 3]","[6, 12, 8, 9, 1, 5, 7]"
7,"[37, 2, 4, 3]","[6, 27, 8, 10, 1, 5, 7]"
8,"[2, 4, 1, 5, 3, 14, 2, 18, 3]","[6, 19, 8, 20, 1, 21, 8, 15, 1, 5, 7]"
9,"[11, 2, 4, 1, 5, 3]","[6, 12, 8, 22, 1, 9, 8, 4, 1, 5, 7]"


# PyTorch Dataset and Collate Function

In [3]:
# PyTorch Dataset
class TrainDataset(Dataset):
    """Exposes a tokenized DataFrame as a torch Dataset of (source, target) pairs.

    The wrapped frame must provide the columns ``function_tokens`` and
    ``expansion_tokens``, each cell holding a list of integer token ids.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(function_tensor, expansion_tensor)`` for row ``idx`` as int64 tensors."""
        row = self.dataset.iloc[idx]
        # Seq2seq pairing: the function ids feed the encoder, the expansion
        # ids are what the decoder has to reproduce.
        source, target = (
            torch.tensor(row[column], dtype=torch.long)
            for column in ('function_tokens', 'expansion_tokens')
        )
        return source, target

In [5]:
# Collate function to pad sequences
def collate_fn(batch, pad_idx=0):
    """Pad a batch of variable-length (src, trg) tensor pairs into two 2-D tensors.

    Args:
        batch: List of ``(src, trg)`` pairs of 1-D tensors, as produced by the
            dataset's ``__getitem__``.
        pad_idx: Value used to fill positions past the end of shorter
            sequences. Defaults to 0, the padding index used by the embeddings
            and the loss in this notebook.

    Returns:
        Tuple ``(src_padded, trg_padded)``, each of shape
        ``[batch, longest_sequence_in_batch]``.
    """
    # batch: list of (src, trg) pairs
    src_seqs, trg_seqs = zip(*batch)
    src_padded = nn.utils.rnn.pad_sequence(list(src_seqs), batch_first=True, padding_value=pad_idx)
    trg_padded = nn.utils.rnn.pad_sequence(list(trg_seqs), batch_first=True, padding_value=pad_idx)
    return src_padded, trg_padded

# LSTM Encoder-Decoder based model

In [6]:
# Encoder
class Encoder(nn.Module):
    """Embeds the source token ids and summarizes them with a multi-layer LSTM.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        embed_size: Width of each token embedding.
        hidden_size: Width of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim, embed_size, hidden_size, num_layers):
        super().__init__()
        # Index 0 is the padding id, so its embedding stays fixed at zero.
        self.embedding = nn.Embedding(
            num_embeddings=input_dim,
            embedding_dim=embed_size,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, src):
        """Encode ``src`` ([batch, src_len]) and return the final ``(hidden, cell)`` states.

        The per-step LSTM outputs are discarded; only the final states are
        returned, each shaped [num_layers, batch, hidden_size].
        """
        src_embedded = self.embedding(src)  # [batch, src_len, embed_size]
        _, final_state = self.lstm(src_embedded)
        hidden, cell = final_state
        return hidden, cell

# Decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder that emits log-probabilities over the target vocabulary.

    Args:
        output_dim: Size of the target vocabulary (also stored as ``self.output_dim``).
        embed_size: Width of each token embedding.
        hidden_size: Width of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, output_dim, embed_size, hidden_size, num_layers):
        super().__init__()
        self.output_dim = output_dim
        # Index 0 is the padding id, so its embedding stays fixed at zero.
        self.embedding = nn.Embedding(
            num_embeddings=output_dim,
            embedding_dim=embed_size,
            padding_idx=0,
        )
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_dim)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one token.

        Args:
            input: Token ids for the current step, shape [batch].
            hidden: Previous LSTM hidden state.
            cell: Previous LSTM cell state.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds
            log-probabilities of shape [batch, output_dim] and the states are
            the updated LSTM states.
        """
        # Insert a length-1 time axis so the batch_first LSTM sees [batch, 1].
        step_ids = input.unsqueeze(1)
        step_embedded = self.embedding(step_ids)  # [batch, 1, embed_size]
        lstm_out, next_state = self.lstm(step_embedded, (hidden, cell))
        hidden, cell = next_state
        logits = self.fc(lstm_out.squeeze(1))  # [batch, output_dim]
        return self.log_softmax(logits), hidden, cell

# Seq2Seq Model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with optional teacher forcing.

    Args:
        encoder: Module whose ``forward(src)`` returns ``(hidden, cell)``.
        decoder: Module exposing ``output_dim`` and a
            ``forward(input, hidden, cell)`` that returns
            ``(prediction, hidden, cell)``.
        device: Stored as ``self.device`` for callers that read it. It is not
            used to place tensors in ``forward``; buffers follow the input.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg_len - 1`` steps and return the per-step predictions.

        Args:
            src: Source token ids, shape [batch, src_len].
            trg: Target token ids, shape [batch, trg_len]; column 0 is fed to
                the decoder as the first input (the <SOS> token).
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the decoder's own argmax as the next input.

        Returns:
            Tensor of shape [batch, trg_len, output_dim]. Position 0 is left as
            zeros because nothing is predicted for the <SOS> slot.
        """
        # src: [batch, src_len]
        # trg: [batch, trg_len]
        batch_size = src.size(0)
        trg_len = trg.size(1)
        output_dim = self.decoder.output_dim

        # Allocate on the input's device rather than the device captured at
        # construction time: the latter goes stale as soon as the model is
        # moved (e.g. to the CPU after training).
        outputs = torch.zeros(batch_size, trg_len, output_dim, device=src.device)
        hidden, cell = self.encoder(src)
        # first input to decoder is the <SOS> token
        input = trg[:, 0]  # [batch]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)  # [batch]
            input = trg[:, t] if teacher_force else top1
        return outputs

# Training Class for the LSTM Encoder-Decoder

In [7]:
# Training Class
class Train:
    """Builds the LSTM encoder-decoder model and runs its training loop.

    Args:
        epoch: Number of passes over the dataloader performed by ``run``.
        batch_size: Stored on the instance only; the effective batch size is
            whatever the dataloader passed to ``run`` uses.
        input_dim: Source vocabulary size (encoder embedding rows).
        embed_size: Embedding width for encoder and decoder.
        hidden_size: LSTM hidden width for encoder and decoder.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        output_dim: Target vocabulary size (decoder output classes).
    """

    def __init__(self, epoch, batch_size, input_dim, embed_size, hidden_size, num_layers, output_dim):
        self.epoch = epoch
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        encoder = Encoder(input_dim, embed_size, hidden_size, num_layers)
        decoder = Decoder(output_dim, embed_size, hidden_size, num_layers)
        self.model = Seq2Seq(encoder, decoder, self.device).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        # The decoder emits log-probabilities, hence NLLLoss; padded target
        # positions (index 0 by default) do not contribute to the loss.
        self.criterion = nn.NLLLoss(ignore_index=0)

    def _place_model(self, device):
        """Move the model to ``device`` and keep its ``device`` attribute in sync."""
        self.model.to(device)
        self.model.device = device

    def run(self, dataloader, trg_pad_idx=0):
        """Train for ``self.epoch`` epochs, printing the loss on epoch 1 and every 100th.

        Args:
            dataloader: Iterable yielding ``(src, trg)`` batches of padded
                token-id tensors.
            trg_pad_idx: Target id that marks padding and is excluded from the
                loss. Defaults to 0.
        """
        # Honor the caller's padding index instead of silently ignoring it.
        if self.criterion.ignore_index != trg_pad_idx:
            self.criterion = nn.NLLLoss(ignore_index=trg_pad_idx)
        # get_model() may have moved the model to the CPU since the last run;
        # bring it back so it matches the batches moved to self.device below.
        self._place_model(self.device)

        for epoch in range(self.epoch):
            self.model.train()
            # Sum (not mean) of the per-batch losses over the epoch.
            epoch_loss = 0
            for src, trg in dataloader:
                src, trg = src.to(self.device), trg.to(self.device)
                self.optimizer.zero_grad()
                output = self.model(src, trg)
                # output: [batch, trg_len, output_dim]
                # trg: [batch, trg_len]
                # flatten both for loss computation:
                output = output[:, 1:].reshape(-1, output.shape[-1])  # skip first token (<SOS>) prediction
                trg = trg[:, 1:].reshape(-1)
                loss = self.criterion(output, trg)
                loss.backward()
                self.optimizer.step()
                epoch_loss += loss.item()
            if (epoch + 1) % 100 == 0 or epoch == 0:
                print(f'Epoch {epoch+1} - Loss: {epoch_loss:.4f}')

    def get_model(self):
        """Move the model to the CPU (in place) and return it.

        The returned object is the same module held by the trainer; its
        ``device`` attribute is updated to match so that it does not keep
        pointing at the training device.
        """
        self._place_model(torch.device("cpu"))
        return self.model

# Initialize Dataset and Training Setup

In [8]:
# PyTorch Training Setup
# Seed the RNGs this cell depends on so a fresh Restart & Run All repeats the
# same run: `torch` drives weight initialization and DataLoader shuffling,
# `random` drives the teacher-forcing coin flips inside the model.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Hyperparameters
epoch = 500
batch_size = 1
embed_size = 32
hidden_size = 64
num_layers = 2

vocab_to_int, int_to_vocab = taylor_dataset.get_token_dicts()
input_dim = len(vocab_to_int) + 1  # +1 for padding idx=0
output_dim = len(vocab_to_int) + 1

dataset = TrainDataset(tokenized_df)
# Use the declared hyperparameter rather than a separate literal so the
# loader and the trainer cannot drift apart.
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Train Model
trainer = Train(epoch, batch_size, input_dim, embed_size, hidden_size, num_layers, output_dim)
trainer.run(train_loader)

Epoch 1 - Loss: 102.4220
Epoch 100 - Loss: 1.8235
Epoch 200 - Loss: 0.1597
Epoch 300 - Loss: 0.0348
Epoch 400 - Loss: 0.0089
Epoch 500 - Loss: 0.0024


# Prediction function

In [9]:
def predict_sample(model, src_tensor, vocab_to_int, int_to_vocab, max_len=30):
    """
    Greedily decodes the output sequence for one input sequence using the trained model.

    The model is switched to eval mode, the source is encoded once, and the
    decoder is then fed its own argmax prediction step by step, starting from
    <SOS>, until it emits <EOS> or ``max_len`` tokens have been produced.

    Args:
        model: The trained Seq2Seq model (must expose ``encoder`` and ``decoder``).
        src_tensor: Tensor containing the tokenized input sequence (1D tensor).
        vocab_to_int: Dictionary mapping tokens to indices; must contain
            "<SOS>" and "<EOS>".
        int_to_vocab: Dictionary mapping indices to tokens.
        max_len: Maximum number of tokens to generate.

    Returns:
        List of tokens representing the predicted expansion (without <SOS> and
        <EOS>). A predicted index that has no entry in ``int_to_vocab`` (for
        example the padding index 0, which the vocabulary never assigns) is
        rendered as "<UNK>" instead of raising ``KeyError``.
    """
    model.eval()
    device = next(model.parameters()).device

    sos_token = vocab_to_int["<SOS>"]
    eos_token = vocab_to_int["<EOS>"]
    predicted_tokens = []

    # Inference only: one no_grad scope covers both encoding and decoding.
    with torch.no_grad():
        # Add batch dimension and send to device, then encode the input sequence.
        hidden, cell = model.encoder(src_tensor.unsqueeze(0).to(device))

        # First decoder input is <SOS>
        input_token = torch.tensor([sos_token], device=device)

        # Decode one token at a time
        for _ in range(max_len):
            output, hidden, cell = model.decoder(input_token, hidden, cell)
            top1 = output.argmax(1).item()
            if top1 == eos_token:
                break
            predicted_tokens.append(top1)
            input_token = torch.tensor([top1], device=device)

    # Convert token indices to words
    return [int_to_vocab.get(token, "<UNK>") for token in predicted_tokens]

# Requires the `trainer` instance created (and trained) in the setup cell above.
# get_model() returns the model after moving it to the CPU; eval() then puts it
# in inference mode.
trained_model = trainer.get_model()
trained_model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(67, 32, padding_idx=0)
    (lstm): LSTM(32, 64, num_layers=2, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(67, 32, padding_idx=0)
    (lstm): LSTM(32, 64, num_layers=2, batch_first=True)
    (fc): Linear(in_features=64, out_features=67, bias=True)
    (log_softmax): LogSoftmax(dim=1)
  )
)

In [10]:
# Pick the first sample (index 0) from the training dataset.
# sample_target is unpacked but not used in this cell.
sample_input, sample_target = dataset[0]

# Predict the expansion from the function tokens and print it next to the
# decoded input tokens.
predicted_expansion = predict_sample(trained_model, sample_input, vocab_to_int, int_to_vocab)
print("Function Tokens (input):", [int_to_vocab[token] for token in sample_input.tolist()])
print("Predicted Expansion:", " ".join(predicted_expansion))

Function Tokens (input): ['asin', '(', 'x', ')']
Predicted Expansion: x**3/6 + x


In [11]:
# Pick the second sample (index 1) from the training dataset.
# sample_target is unpacked but not used in this cell.
sample_input, sample_target = dataset[1]

# Predict the expansion from the function tokens and print it next to the
# decoded input tokens.
predicted_expansion = predict_sample(trained_model, sample_input, vocab_to_int, int_to_vocab)
print("Function Tokens (input):", [int_to_vocab[token] for token in sample_input.tolist()])
print("Predicted Expansion:", " ".join(predicted_expansion))

Function Tokens (input): ['(', 'x', '+', '1', ')', '**', '(', '-2', ')']
Predicted Expansion: 5*x**4 - 4*x**3 + 3*x**2 - 2*x + 1
