# Add punctuation to text
### Given a DataFrame with the text in one column:

In [None]:
!pip install deepmultilingualpunctuation

In [None]:
from deepmultilingualpunctuation import PunctuationModel
import pandas as pd

# Load the input table from 'text.csv' into a DataFrame.
df = pd.read_csv('text.csv')


def add_punctuation(text, model):
    """Return *text* after passing it through ``model.restore_punctuation``.

    Args:
        text: The input passed unchanged to the model.
        model: Any object exposing a ``restore_punctuation(text)`` method.

    Returns:
        Whatever ``model.restore_punctuation`` returns for *text*.
    """
    punctuated = model.restore_punctuation(text)
    return punctuated

# Build the punctuation model once and reuse it for every row.
model_punctuation = PunctuationModel()
# Series.apply forwards extra positional arguments to the function through
# `args`, which must be a tuple (note the trailing comma). Any other keyword
# is passed straight to add_punctuation, which only accepts (text, model).
df['text'] = df['text'].apply(add_punctuation, args=(model_punctuation,))


# Create the embeddings

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import re
# Instantiate the SentenceTransformer embedding model identified by the
# name 'sentence-transformers/sentence-t5-base'.
model_embedding = SentenceTransformer('sentence-transformers/sentence-t5-base')

def split_text(text):
    """Split *text* into trimmed, non-empty fragments at punctuation marks.

    The text is cut at every comma, period, question mark, exclamation mark,
    colon and semicolon. Surrounding whitespace is stripped from each
    fragment and fragments that end up empty are dropped.

    Args:
        text: The string to split.

    Returns:
        A list of the remaining fragments, in their original order.
    """
    trimmed = (piece.strip() for piece in re.split(r'[,.?!:;]', text))
    return [piece for piece in trimmed if piece]


def generate_embeddings(text_list, model):
    """Encode *text_list* with ``model.encode`` and return the result.

    Args:
        text_list: The items to embed; passed unchanged to the model.
        model: Any object exposing an ``encode(text_list)`` method.

    Returns:
        Whatever ``model.encode`` returns for *text_list*.
    """
    embeddings = model.encode(text_list)
    return embeddings


In [None]:
# Replace each text with its list of punctuation-delimited fragments.
df['text'] = df['text'].apply(split_text)
# generate_embeddings takes (text_list, model); the model is supplied to
# every call through Series.apply's `args` tuple.
df['embeddings'] = df['text'].apply(generate_embeddings, args=(model_embedding,))


# PyTorch model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


def pad_and_convert_to_tensor(sequences, max_len=None, pad_value=0.0):
    """Pad 2-D sequences along their first axis and stack them into a tensor.

    Each sequence is a ``(time_steps, features)`` array-like. Sequences
    shorter than ``max_len`` get rows of ``pad_value`` appended; sequences
    longer than ``max_len`` are truncated to their first ``max_len`` rows.

    Args:
        sequences: Iterable of 2-D array-likes that share the same number of
            features (second dimension).
        max_len: Target number of time steps. Defaults to the length of the
            longest sequence.
        pad_value: Value used to fill the appended rows.

    Returns:
        A ``torch.float32`` tensor of shape
        ``(len(sequences), max_len, features)``.

    Raises:
        ValueError: If ``sequences`` is empty, or (from NumPy) if a sequence
            is not 2-D or the feature dimensions differ.
    """
    # Local import: the cell defining this function only imports torch.
    import numpy as np

    arrays = [np.asarray(seq, dtype=np.float32) for seq in sequences]
    if not arrays:
        raise ValueError("sequences must contain at least one sequence")

    if max_len is None:
        max_len = max(len(arr) for arr in arrays)

    padded_sequences = []
    for arr in arrays:
        # Truncate first so the pad width below can never be negative.
        arr = arr[:max_len]
        missing_rows = max_len - len(arr)
        padded_sequences.append(
            np.pad(
                arr,
                ((0, missing_rows), (0, 0)),
                mode='constant',
                constant_values=pad_value,
            )
        )

    return torch.tensor(np.stack(padded_sequences), dtype=torch.float32)

class Masking(nn.Module):
    """Zero out padded timesteps and report which positions were kept.

    A timestep (a slice along the last dimension) counts as padding only
    when *every* feature in it equals ``mask_value``. Deciding per timestep
    rather than per element keeps genuine feature values that merely happen
    to equal ``mask_value`` from being zeroed.

    Args:
        mask_value: The value that marks a padded timestep.
    """

    def __init__(self, mask_value=0.0):
        super().__init__()
        self.mask_value = mask_value

    def forward(self, x):
        """Apply the mask to ``x``.

        Args:
            x: Tensor whose last dimension holds the features of a timestep.

        Returns:
            A tuple ``(masked_x, mask)``. ``mask`` is a float tensor with the
            same shape as ``x``: 1.0 across kept timesteps, 0.0 across padded
            ones. ``masked_x`` is ``x * mask``.
        """
        # True where at least one feature of the timestep differs from the
        # padding value, i.e. the timestep carries real data.
        keep = (x != self.mask_value).any(dim=-1, keepdim=True)
        # Broadcast back to x's shape so the returned mask keeps its layout.
        mask = keep.expand_as(x).float()
        return x * mask, mask


class TimeDistributed(nn.Module):
    """Apply a wrapped module independently to every timestep of a batch.

    Args:
        module: The module to run on inputs of shape
            ``(batch_size * time_steps, input_size)``.
    """

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        """Run the wrapped module over all timesteps.

        Args:
            x: Tensor of shape ``(batch_size, time_steps, input_size)``.

        Returns:
            Tensor of shape ``(batch_size, time_steps, output_size)``, where
            ``output_size`` is the last dimension of the module's output.
        """
        n_batch, n_steps, n_features = x.shape

        # Fold batch and time into one leading axis for the wrapped module.
        flat_in = x.reshape(n_batch * n_steps, n_features)
        flat_out = self.module(flat_in)

        # Unfold the leading axis back into (batch, time).
        return flat_out.reshape(n_batch, n_steps, flat_out.size(-1))

class BiRNNModel(nn.Module):
    """Masking, then a bidirectional LSTM, then a per-timestep linear layer.

    Args:
        input_size: Number of features per input timestep.
        hidden_size: Hidden size of the LSTM (per direction).
        output_size: Number of features produced per timestep.
        mask_value: Value handed to the ``Masking`` layer.
    """

    def __init__(self, input_size, hidden_size, output_size, mask_value=0.0):
        super().__init__()

        self.masking = Masking(mask_value=mask_value)
        self.rnn = nn.LSTM(
            input_size,
            hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # The LSTM is bidirectional, so its output carries 2 * hidden_size
        # features per timestep.
        self.time_distributed = TimeDistributed(
            nn.Linear(hidden_size * 2, output_size)
        )

    def forward(self, x):
        """Run the model on a batch.

        Args:
            x: Batch-first input tensor for the LSTM.

        Returns:
            A tuple ``(output, mask)``: the per-timestep projection of the
            LSTM output and the mask produced by the masking layer. The mask
            is only returned; it is not passed to the LSTM.
        """
        masked_x, mask = self.masking(x)
        rnn_out, _state = self.rnn(masked_x)
        return self.time_distributed(rnn_out), mask


In [None]:
# Collect the per-row embedding arrays from df['embeddings'] as the sequences
# to pad. NOTE(review): this assumes the embeddings column is the intended
# input for padding - confirm.
sequences = df['embeddings'].tolist()
padded_tensor = pad_and_convert_to_tensor(sequences, pad_value=0.0)


# Demo configuration
input_size = 10   # features per input timestep
hidden_size = 20  # features in the LSTM hidden state
output_size = 2   # features produced per timestep
mask_value = 0.0

# Build the model
model = BiRNNModel(input_size, hidden_size, output_size, mask_value)

# Dummy batch of shape (batch_size=1, time_steps=3, input_size=10);
# the middle timestep is all zeros.
dummy_input = torch.tensor(
    [[
        list(range(1, 11)),
        [0] * 10,
        list(range(11, 21)),
    ]],
    dtype=torch.float32,
)

# Run one forward pass and show the results
output, mask = model(dummy_input)

print("Output:", output)
print("Mask:", mask)