In [2]:
!pip install -q fairseq2==v0.3.0rc1 --pre --extra-index-url  https://fair.pkg.atmeta.com/fairseq2/whl/rc/pt2.5.1/cu124 --upgrade
!pip install -q sonar-space

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sonar-space 0.4.0 requires fairseq2~=0.4.0, but you have fairseq2 0.3.0rc1 which is incompatible.
torchaudio 2.6.0+cu124 requires torch==2.6.0, but you have torch 2.5.1 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.5.1 which is incompatible.[0m[31m
[0m

In [3]:
!pip install -q  wtpsplit sonar

In [4]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cuda


In [5]:
import torch
import torch.nn as nn
from wtpsplit import SaT
from sonar.inference_pipelines.text import TextToEmbeddingModelPipeline
from sonar.inference_pipelines.text import EmbeddingToTextModelPipeline

In [6]:
# Transformer Model
class Transformer(nn.Module):
    def __init__(self, embd_dim, dim, layers, heads, dropout, device):
        super().__init__()
        self.embd_dim = embd_dim
        self.dim = dim
        self.layers = layers
        self.heads = heads
        self.dropout = dropout

        self.prenet = nn.Sequential(
            nn.LayerNorm(embd_dim),
            nn.Linear(embd_dim, dim),
            nn.ReLU(),
            nn.Dropout(dropout)  # Dropout to prevent overfitting
        )

        self.decoder = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=dim, nhead=heads, dropout=dropout) for _ in range(layers)
        ])

        self.postnet = nn.Sequential(
            nn.Linear(dim, embd_dim),
            nn.Softmax(dim=-1)  # Softmax to ensure valid probability distribution
        )

    def forward(self, x):
        x = self.prenet(x)
        for l in self.decoder:
            x = l(x, x)
        return self.postnet(x)

In [7]:
# LCM Model
class LCMModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.sat_sm = SaT(config.model_name)
        print("Splitter initialized")

        self.t2vec_model = TextToEmbeddingModelPipeline(
            encoder=config.sonar_enc, tokenizer=config.sonar_enc, device=torch.device(config.device)
        )
        print("Text-to-Vector model initialized")

        self.transformer = Transformer(
            config.embd_dim, config.dim, config.layers, config.heads, config.dropout, config.device
        ).to(config.device)
        print("Transformer initialized")

        self.vec2text_model = EmbeddingToTextModelPipeline(
            decoder=config.sonar_dec, tokenizer=config.sonar_dec, device=torch.device(config.device)
        )
        print("Vector-to-Text model initialized")

    def split_into_concepts(self, text):
        return self.sat_sm.split(text, threshold=self.config.threshold)

    def forward(self, embeddings):
        out_embeddings = self.transformer.forward(embeddings)
        return out_embeddings

    def generate(self, text, num_generated_concepts=1):
        with torch.no_grad():
            concepts = self.split_into_concepts(text)
            print("\nInitial Concepts:", concepts)  # Debugging

            for c in range(num_generated_concepts):
                embeddings = self.t2vec_model.predict(concepts, source_lang=self.config.lang)
                print("\nEmbeddings:", embeddings)  # Debugging

                out_embeddings = self.forward(embeddings)
                print("\nTransformed Embeddings:", out_embeddings)  # Debugging

                # Removed 'num_beams' to prevent TypeError
                next_concept = self.vec2text_model.predict(
                    out_embeddings, target_lang=self.config.lang, max_seq_len=self.config.max_seq_len
                )
                print("\nGenerated Concept:", next_concept)  # Debugging

                concepts.append(next_concept[0])

        return " ".join(concepts)  # Return as a proper sentence


In [8]:
# Configuration Class
class LCMConfig:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Transformer args
        self.embd_dim = 1024  # Dimension of SONAR embeddings
        self.dim = 1024       # Keep this close to embedding size
        self.layers = 2       # Reduce layers for better optimization
        self.heads = 8        # Number of attention heads
        self.dropout = 0.1    # Add dropout to prevent overfitting

        # Sonar args
        self.lang = "eng_Latn"
        self.max_seq_len = 256
        self.sonar_enc = "text_sonar_basic_encoder"
        self.sonar_dec = "text_sonar_basic_decoder"

        # wtpsplit args
        self.model_name = "sat-1l-sm"
        self.threshold = 0.05

In [9]:
# Initialize and Run
config = LCMConfig()
lcm = LCMModel(config)

text = "This is a test sentence."
output = lcm.generate(text, num_generated_concepts=2)
print("\nGenerated Output:", output)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Splitter initialized
Text-to-Vector model initialized
Transformer initialized
Vector-to-Text model initialized

Initial Concepts: ['This is a test sentence.']

Embeddings: tensor([[ 0.0013, -0.0023, -0.0098,  ..., -0.0089,  0.0024, -0.0056]],
       device='cuda:0')

Transformed Embeddings: tensor([[0.0006, 0.0008, 0.0005,  ..., 0.0005, 0.0009, 0.0023]],
       device='cuda:0')

Generated Concept: ["In the meantime, I'm going to share with you some of the things that I've learned from the past, and I'm going to share with you some of the things that I've learned from the past."]

Embeddings: tensor([[ 0.0013, -0.0023, -0.0098,  ..., -0.0089,  0.0024, -0.0056],
        [-0.0080,  0.0067,  0.0058,  ..., -0.0055, -0.0006,  0.0126]],
       device='cuda:0')

Transformed Embeddings: tensor([[0.0010, 0.0015, 0.0006,  ..., 0.0005, 0.0010, 0.0016],
        [0.0013, 0.0009, 0.0012,  ..., 0.0011, 0.0016, 0.0027]],
       device='cuda:0')

Generated Concept: ['In the meantime, I\'m going to share