In [None]:
import torch.optim as optim
import re
from tokenizers import models, pre_tokenizers, trainers, Tokenizer
import os
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch.utils.data import Dataset
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:


def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s.,!?;:\'"]+', '', text)
    return text

# Every entry of data/Verse is treated as one plain-text lyrics file.
couplets_songs = os.listdir('data/Verse')
couplets_lyrics = []

# Filter the macOS Finder metadata file by *name* (not by a joined path string,
# which depends on the platform's separator) and sort so the corpus order is
# reproducible: os.listdir returns entries in arbitrary order.
couplets_files = [
    os.path.join("data/Verse", song)
    for song in sorted(couplets_songs)
    if song != ".DS_Store"
]

for file in couplets_files:
    # Read with an explicit encoding so the result does not depend on the
    # platform default; the corpus is written back out as UTF-8 later on.
    with open(file, 'r', encoding="utf-8") as f:
        for line in f:
            # Collapse every whitespace run to a single space and skip lines
            # that end up empty.
            normalized = " ".join(line.split())
            if normalized:
                couplets_lyrics.append(normalized)

In [None]:
# Word count of every lyric line, in ascending order, so the shortest and the
# longest line sit at the two ends of the list.
couplets_lens = sorted(len(line.split()) for line in couplets_lyrics)
shortest, longest = couplets_lens[0], couplets_lens[-1]
print(shortest, longest)

In [None]:
# Drop empty strings in a single pass. The previous `while '' in ...: remove('')`
# rescanned the list for every removal (quadratic); slice assignment keeps the
# mutation in place, so the list object itself is unchanged.
couplets_lyrics[:] = [line for line in couplets_lyrics if line != '']

In [None]:
# Actually store the cleaned text: assigning to the loop variable inside a
# `for` loop only rebinds that variable and leaves the list untouched.
# Whitespace is re-collapsed because clean_text can delete a character that sat
# between two spaces.
couplets_lyrics = [" ".join(clean_text(line).split()) for line in couplets_lyrics]

# Cleaning can strip a line down to nothing (e.g. a line made only of symbols).
couplets_lyrics = [line for line in couplets_lyrics if line]

# Length statistics are taken after cleaning so they describe the final corpus.
couplets_lens = sorted(len(line.split()) for line in couplets_lyrics)
print(couplets_lens[0], couplets_lens[-1])

In [None]:
# Persist the corpus as UTF-8 text, one stripped lyric line per row.
with open("couplets_lyrics.txt", "w", encoding="utf-8") as f:
    f.writelines(line.strip() + "\n" for line in couplets_lyrics)

In [None]:
from tokenizers import models, pre_tokenizers, trainers, Tokenizer

# Untrained BPE tokenizer: pieces it cannot represent map to "<unk>", and the
# Whitespace pre-tokenizer splits raw text before the BPE model sees it.
bpe_model = models.BPE(unk_token="<unk>")
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer settings: vocabulary capped at 8000 entries, with the four control
# tokens registered as special tokens.
# NOTE(review): the rest of the notebook pads with id 0, which presumably
# relies on "<pad>" being listed first here - confirm before reordering.
trainer = trainers.BpeTrainer(
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"],
    vocab_size=8000,
)

# Learn the vocabulary from the corpus file, then save it for later reloads.
training_files = ["couplets_lyrics.txt"]
tokenizer.train(training_files, trainer)
tokenizer.save("tokenizer.json")

In [None]:
# Import locally so this cell works on its own (e.g. when the training cell is
# skipped and the saved tokenizer is just reloaded): the notebook's first cell
# ends by importing the Keras `Tokenizer`, which rebinds the name and has no
# `from_file`.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")

In [None]:
def preprocess_line(line):
    """Strip ``line`` and wrap it in the ``<bos>`` / ``<eos>`` marker strings."""
    stripped = line.strip()
    return f"<bos> {stripped} <eos>"
# Encode every marked-up lyric line and keep only its list of token ids.
tokenized_ids = []
for line in couplets_lyrics:
    encoding = tokenizer.encode(preprocess_line(line))
    tokenized_ids.append(encoding.ids)

In [None]:

# One tensor per tokenized line.
tensor_seqs = list(map(torch.tensor, tokenized_ids))

# Batch-first padding with value 0 up to the longest line.
padded_seqs = pad_sequence(tensor_seqs, padding_value=0, batch_first=True)

print(padded_seqs.shape)

In [None]:
import torch
import math

def get_sinusoidal_positional_encoding(seq_len, d_model):
    """Build the fixed sinusoidal position table of shape ``(seq_len, d_model)``.

    For position ``pos`` and even column index ``i``, column ``i`` holds
    ``sin(pos / 10000 ** (i / d_model))`` and column ``i + 1`` (when it exists)
    holds the matching ``cos`` term.

    Args:
        seq_len: number of positions (rows of the table).
        d_model: feature dimension (columns of the table); may be odd, in
            which case the last column is a sine column with no cosine partner.

    Returns:
        A ``torch.Tensor`` of shape ``(seq_len, d_model)``.
    """
    pos = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    i = torch.arange(0, d_model, 2, dtype=torch.float)
    angle_rates = 1 / torch.pow(10000, i / d_model)
    angles = pos * angle_rates  # one column per even feature index

    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # trim the angles to fit instead of failing with a shape mismatch.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    return pe

In [None]:
# Example table for 28 positions and 512 features.
# NOTE(review): 28 is hard-coded - presumably the padded sequence length of
# this corpus; confirm. `positional_encoding` is not read again in the cells
# visible here (a later cell builds its own table from the padded batch).
positional_encoding = get_sinusoidal_positional_encoding(seq_len=28, d_model=512)

In [None]:
# Take the vocabulary size from the tokenizer instead of repeating the
# trainer's 8000 cap: the learned vocabulary can end up smaller than the cap,
# and every table/logit row should correspond to a real token id.
vocab_size = tokenizer.get_vocab_size()
embedding_dim = 512
# padding_idx=0 matches the padding value used when the sequences were padded.
token_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

x = token_embedding(padded_seqs)  # shape : (batch_size, seq_len, embedding_dim)

In [None]:
# Positional table as wide as the padded batch, with a leading axis of size 1
# so it broadcasts over the batch dimension.
pe = get_sinusoidal_positional_encoding(
    d_model=embedding_dim,
    seq_len=padded_seqs.shape[1],
).unsqueeze(0)  # shape : (1, seq_len, d_model)
pe = pe.to(x.device)

In [None]:
# Rebuild from the embedding lookup rather than `x = x + pe`, so re-running
# this cell cannot add the positional term a second time.
x = token_embedding(padded_seqs) + pe

In [None]:
# Next-token training pairs: the input drops the last id and the target drops
# the first, e.g. [2, 45, 17, 311, 892, 3] gives
# ([2, 45, 17, 311, 892], [45, 17, 311, 892, 3]).
# Lines with fewer than two ids cannot form a pair and are skipped.
examples = [
    (ids[:-1], ids[1:])
    for ids in tokenized_ids
    if len(ids) >= 2
]

In [None]:


class TextGenerationDataset(Dataset):
    """Dataset over pre-built ``(input_ids, target_ids)`` pairs."""

    def __init__(self, examples, pad_token_id=0):
        # Both values are stored untouched; pad_token_id is only recorded on
        # the instance and is not used by the methods of this class.
        self.pad_token_id = pad_token_id
        self.examples = examples

    def __len__(self):
        """Number of stored pairs."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return pair ``idx`` as two tensors built with ``torch.tensor``."""
        input_ids, target_ids = self.examples[idx]
        return torch.tensor(input_ids), torch.tensor(target_ids)

In [None]:


def collate_fn(batch):
    """Turn a list of ``(input, target)`` tensor pairs into two padded batches.

    Inputs and targets are padded separately with value 0 to the longest
    sequence of the batch, batch dimension first, and returned as a 2-tuple.
    """
    inputs, targets = zip(*batch)
    padded_inputs, padded_targets = (
        pad_sequence(seqs, batch_first=True, padding_value=0)
        for seqs in (inputs, targets)
    )
    return padded_inputs, padded_targets

In [None]:
# Wrap the (input, target) pairs and serve them as shuffled mini-batches of 32;
# collate_fn pads each batch to its own longest sequence.
dataset = TextGenerationDataset(examples)
dataloader = DataLoader(
    dataset,
    collate_fn=collate_fn,
    shuffle=True,
    batch_size=32,
)

In [None]:


class SimpleTransformerModel(nn.Module):
    """Left-to-right language model built from a Transformer encoder stack.

    Token embeddings and learned position embeddings are summed, passed through
    ``num_layers`` encoder layers under a causal mask, and projected to one
    logit per vocabulary entry.

    Args:
        vocab_size: number of token ids (embedding rows and output logits).
        d_model: embedding / hidden width.
        nhead: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        dim_feedforward: width of each layer's feed-forward sub-block.
        max_len: longest sequence the position embedding can index.
        pad_token_id: token id treated as padding (ignored as attention key).
    """

    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=4, dim_feedforward=256, max_len=512, pad_token_id=0):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model, padding_idx=pad_token_id)
        self.pos_embedding = nn.Embedding(max_len, d_model)
        self.dropout = nn.Dropout(0.1)

        # The encoder is sequence-first, so forward() transposes around it.
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout=0.1, batch_first=False)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)

        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model
        self.pad_token_id = pad_token_id

    def forward(self, x_input):
        """Compute next-token logits for a batch of token ids.

        Args:
            x_input: integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the position table size.
        """
        _, seq_len = x_input.size()

        # Fail early with a readable message instead of an out-of-range
        # embedding lookup.
        max_positions = self.pos_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model's max_len {max_positions}"
            )

        # (1, seq_len) position ids; the embedding broadcasts over the batch.
        positions = torch.arange(seq_len, device=x_input.device).unsqueeze(0)
        x = self.dropout(self.token_embedding(x_input) + self.pos_embedding(positions))

        x = x.transpose(0, 1)  # -> (seq_len, batch, d_model)

        # Boolean causal mask (True = may not attend), built in the same dtype
        # as the padding mask so the two masks are not of mismatched types.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x_input.device),
            diagonal=1,
        )
        src_key_padding_mask = (x_input == self.pad_token_id)

        out = self.transformer(x, mask=causal_mask, src_key_padding_mask=src_key_padding_mask)
        out = out.transpose(0, 1)  # -> (batch, seq_len, d_model)

        return self.fc_out(out)

In [None]:

# Use the documented availability check for Apple's Metal backend;
# `torch.mps.is_available` may be missing on older PyTorch releases.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = SimpleTransformerModel(vocab_size=vocab_size).to(device)
# Padded target positions are excluded from the loss.
pad_token_id = tokenizer.token_to_id("<pad>")
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
from tqdm import tqdm

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for x_batch, y_batch in tqdm(dataloader):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Flatten logits to (tokens, vocab) and targets to (tokens,) for the
        # loss criterion.
        logits = model(x_batch)
        flat_logits = logits.view(-1, logits.size(-1))
        flat_targets = y_batch.view(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss of the epoch.
    print(f"Epoch {epoch+1}: loss = {total_loss/len(dataloader):.4f}")

In [None]:
import torch
import torch.nn.functional as F

def generate(model, tokenizer, seed_text="<bos>", max_len=50, temperature=1.0, eos_token="<eos>", device="cpu"):
    """Sample a continuation of ``seed_text`` one token at a time.

    The model is put in eval mode, fed the tokens produced so far, and the
    next token is drawn from the softmax of the last position's logits.
    Generation stops after ``max_len`` new tokens or as soon as ``eos_token``
    is produced.

    Args:
        model: callable mapping a ``(1, seq_len)`` id tensor to logits of
            shape ``(1, seq_len, vocab)``.
        tokenizer: object providing ``encode(text).ids``, ``token_to_id`` and
            ``decode(ids, skip_special_tokens=...)``.
        seed_text: prompt to continue.
        max_len: maximum number of tokens to append.
        temperature: logits are divided by this before sampling; ``0`` selects
            the highest-scoring token (greedy) instead of dividing by zero.
        eos_token: token string that ends generation.
        device: device the id tensor is created on.

    Returns:
        The decoded text of seed plus continuation, special tokens skipped.

    Raises:
        ValueError: if ``seed_text`` encodes to no tokens.
    """
    model.eval()

    seed_ids = tokenizer.encode(seed_text).ids
    if not seed_ids:
        raise ValueError("seed_text must encode to at least one token")
    generated = torch.tensor(seed_ids, dtype=torch.long, device=device).unsqueeze(0)

    # Loop-invariant: look the end-of-sequence id up once.
    eos_id = tokenizer.token_to_id(eos_token)

    with torch.no_grad():
        for _ in range(max_len):
            next_token_logits = model(generated)[:, -1, :]

            if temperature == 0:
                next_token = next_token_logits.argmax(dim=-1, keepdim=True)
            else:
                probs = F.softmax(next_token_logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            generated = torch.cat((generated, next_token), dim=1)

            if next_token.item() == eos_id:
                break

    # Index the batch row instead of squeeze(): squeeze() would collapse a
    # single-token sequence to a scalar and tolist() would return a bare int.
    return tokenizer.decode(generated[0].tolist(), skip_special_tokens=True)

In [None]:
# Sample 90 continuations of the same seed and join them, space-separated,
# after the plain-text opening words.
seed = "<bos> let's make it"
song_parts = ["let's make it"]
for _ in range(90):
    generated_text = generate(model, tokenizer, seed_text=seed, device=device)
    song_parts.append(generated_text)
song = " ".join(song_parts)
print(song)