In [None]:
# Sample sentences
sentences = [
    "hello there",
    "how are you",
    "good morning",
    "see you soon",
    "what is your name",
    "i love programming",
    "where are you from",
    "nice to meet you"
]

# Tokenize the sentences
tokenized_sentences = [sentence.split() for sentence in sentences]

# Create a vocabulary from the unique words in the sentences
vocab = set(word for sentence in tokenized_sentences for word in sentence)
vocab_size = len(vocab)

# Create a word-to-index and index-to-word mapping
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

# Convert sentences to their integer representation
encoded_sentences_word_level = [[word2idx[word] for word in sentence] for sentence in tokenized_sentences]

vocab_size, encoded_sentences_word_level


In [None]:
import re

def load_and_preprocess_wikitext(file_path, max_sentences=1000):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Use regular expressions to split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [sentence.strip() for sentence in sentences if len(sentence.split()) <= MAX_LENGTH]
    
    return sentences[:max_sentences]

# Adjust the file_path to point to your train.txt from WikiText-2
file_path = "wikitext-2/wiki.train.tokens"
wikitext_sentences = load_and_preprocess_wikitext(file_path)

# Print the first few sentences to check
wikitext_sentences[:10]


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy

In [2]:
# Define constants
MAX_LENGTH = 10  # Maximum sentence length
VOCAB_SIZE = 50  # For simplicity, let's assume 50 unique tokens (including padding, start, end tokens)
EMBEDDING_DIM = 256
HIDDEN_DIM = 256
LATENT_DIM = 32  # Dimension of the latent space

# Gumbel softmax temperature
TAU = 1.0


In [3]:
import re

def load_and_preprocess_wikitext(file_path, max_sentences=1000):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Use regular expressions to split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [sentence.strip() for sentence in sentences if len(sentence.split()) <= MAX_LENGTH]
    
    return sentences[:max_sentences]

# Adjust the file_path to point to your train.txt from WikiText-2
file_path = "wikitext-2/wiki.train.tokens"
wikitext_sentences = load_and_preprocess_wikitext(file_path)

# Print the first few sentences to check
wikitext_sentences[:10]


['A large team of writers handled the script .',
 '<unk> can switch classes by changing their assigned weapon .',
 'Development work took approximately one year .',
 'The newer systems were decided upon early in development .',
 'The main color of the Nameless was black .',
 'The anime opening was produced by Production I.G.',
 'The game was released January 27 , 2011 .',
 "He also positively noted the story 's serious tone .",
 'The anime was first announced in November 2010 .',
 'Lee of the U.S.']

In [4]:
# Define the Transformer-based CVAE model
class TransformerEncoder(nn.Module):
    def __init__(self, d_model=EMBEDDING_DIM, nhead=4, num_layers=2):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, d_model)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead), num_layers
        )
        self.fc_logits = nn.Linear(d_model, LATENT_DIM)

    def forward(self, x):
        embedded = self.embedding(x).permute(1, 0, 2)  # Transformer expects seq_len, batch, features
        transformed = self.transformer_encoder(embedded)
        # Use the final state to predict logits for latent space
        logits = self.fc_logits(transformed[-1])
        return logits


class TransformerDecoder(nn.Module):
    def __init__(self, d_model=EMBEDDING_DIM, nhead=4, num_layers=2):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, d_model)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, nhead), num_layers
        )
        self.fc_out = nn.Linear(d_model, VOCAB_SIZE)
        self.fc_z = nn.Linear(LATENT_DIM, d_model)  # Convert z to feature size for transformer

    def forward(self, x, z):
        embedded = self.embedding(x).permute(1, 0, 2)
        z_adjusted = self.fc_z(z).unsqueeze(0)
        output = self.transformer_decoder(embedded, z_adjusted)
        return self.fc_out(output.permute(1, 0, 2))


class TransformerCVAE(nn.Module):
    def __init__(self):
        super(TransformerCVAE, self).__init__()
        self.encoder = TransformerEncoder()
        self.decoder = TransformerDecoder()

    def reparameterize(self, logits):
        return F.gumbel_softmax(logits, tau=TAU, hard=True, dim=-1)

    def forward(self, x):
        logits = self.encoder(x)
        z = self.reparameterize(logits)
        return self.decoder(x, z), logits

transformer_cvae = TransformerCVAE()

# Check model architecture
transformer_cvae


TransformerCVAE(
  (encoder): TransformerEncoder(
    (embedding): Embedding(50, 256)
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (fc_logits): Linear(in_features=256, out_features=32, bias=True)
  )
  (decoder): TransformerDecoder(
    (embedding): Embedding(50, 256)
    (transformer_deco

In [5]:
# loss function
def loss_function(recon_x, x, logits):
    recon_loss = F.cross_entropy(recon_x.permute(0, 2, 1), x, reduction='sum')
    
    # Regularization loss: entropy of the logits
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)
    reg_loss = -torch.sum(probs * log_probs)
    
    return recon_loss + reg_loss

# Initialize optimizer for the new model
optimizer_gumbel = torch.optim.Adam(transformer_cvae.parameters())

# 3. Simulate a small dataset
num_samples = 100
random_sentences = torch.randint(0, VOCAB_SIZE, (num_samples, MAX_LENGTH))

# Training loop for the new model
def train(epoch, data, model, optimizer):
    model.train()
    train_loss = 0
    for i in range(len(data)):
        sentence = data[i].unsqueeze(0)
        optimizer.zero_grad()
        recon_sentence, logits = model(sentence)
        loss = loss_function(recon_sentence, sentence, logits)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    
    print(f"Epoch: {epoch}, Loss: {train_loss / len(data)}")

# # Train the new model for 10 epochs as a start
# for epoch in range(1, 11):
#     train(epoch, random_sentences, transformer_cvae, optimizer_gumbel)


In [6]:
# Define a simple vocabulary
VOCAB = ["the", "a", "man", "woman", "eats", "reads", "book", "apple", "in", "park", "at", "home", "<PAD>", "<SOS>", "<EOS>"]
VOCAB_SIZE = len(VOCAB)
word2index = {word: idx for idx, word in enumerate(VOCAB)}
index2word = {idx: word for word, idx in word2index.items()}

# Generate random sentences
import random

def generate_sentence():
    sentence_length = random.randint(3, MAX_LENGTH)
    return [random.choice(VOCAB) for _ in range(sentence_length)]

def sentence_to_indices(sentence):
    return [word2index[word] for word in sentence]

NUM_REAL_SAMPLES = 1000
real_sentences = [generate_sentence() for _ in range(NUM_REAL_SAMPLES)]
real_sentences_indices = [sentence_to_indices(sentence) for sentence in real_sentences]

# Pad sentences to max length
real_data = torch.full((NUM_REAL_SAMPLES, MAX_LENGTH), word2index["<PAD>"], dtype=torch.long)
for i, sentence in enumerate(real_sentences_indices):
    real_data[i, :len(sentence)] = torch.LongTensor(sentence)

real_data[:5]  # Display first 5 samples

# Print each index and its corresponding word
index_word_mapping = {index: word for word, index in word2index.items()}

print("Index to Word Mapping:")
for idx, word in index_word_mapping.items():
    print(f"{idx}: {word}")

# Print first 10 generated sentences
print("\nFirst 10 Generated Sentences:")
for sentence in real_sentences[:10]:
    print(' '.join(sentence))


Index to Word Mapping:
0: the
1: a
2: man
3: woman
4: eats
5: reads
6: book
7: apple
8: in
9: park
10: at
11: home
12: <PAD>
13: <SOS>
14: <EOS>

First 10 Generated Sentences:
<SOS> reads the in <PAD> in park a
a man <PAD> book reads apple reads
home a eats woman woman in reads man book
<SOS> <PAD> a in man woman <SOS> at a <EOS>
woman home in <EOS> apple man <SOS>
<PAD> at <PAD> a apple
in book <PAD> eats eats
book book woman book <SOS> eats home home <EOS> <SOS>
a park reads at
apple the <SOS> in <PAD> park a woman


In [7]:
# Train the Transformer-based CVAE with Gumbel-softmax using the real simulated data

# We will train for 5 epochs for demonstration purposes
for epoch in range(0, 3):
    train(epoch, real_data, transformer_cvae, optimizer_gumbel)


Epoch: 0, Loss: 0.1436982194571174
Epoch: 1, Loss: 0.0009616988073394169
Epoch: 2, Loss: 0.0003444205750711262


In [None]:
class Agent:
    def __init__(self, model):
        self.model = model
    
    def encode(self, sentence):
        logits = self.model.encoder(sentence)
        z = self.model.reparameterize(logits)
        return z

    def decode(self, z, sentence_length=MAX_LENGTH):
        # Here, we will use a simple greedy decoding method
        decoded_sentence = torch.zeros(sentence_length, dtype=torch.long).unsqueeze(0)
        for i in range(sentence_length):
            logits = self.model.decoder(decoded_sentence, z)
            predicted_word = torch.argmax(logits, dim=-1)
            decoded_sentence[0, i] = predicted_word[0, i]
        return decoded_sentence

# Initialize a population of agents
NUM_AGENTS = 10
agents = [Agent(TransformerCVAE()) for _ in range(NUM_AGENTS)]

# Check initialization
agents[0]


In [None]:
def interaction_process(agents, data):
    total_interactions = 0
    successful_interactions = 0
    
    # For each agent as a sender
    for sender in agents:
        # Encode a random sentence into a signal
        sentence = random.choice(data).unsqueeze(0)
        encoded_signal = sender.encode(sentence)
        
        # All other agents try to decode the signal
        for receiver in agents:
            if receiver != sender:  # Ensure it's not the same agent
                decoded_sentence = receiver.decode(encoded_signal)
                if torch.all(decoded_sentence == sentence):
                    successful_interactions += 1
                total_interactions += 1

    return successful_interactions / total_interactions

success_rate = interaction_process(agents, real_data)
success_rate

代理初始化：
    我们将初始化一组代理。每个代理都有一个编码器（用于编码消息）和一个解码器（用于解码消息）。
交互过程：
    在每一代中，随机选择两个代理：一个作为发送者，另一个作为接收者。
    发送者选择一个句子并将其编码为一个信号。
    接收者尝试解码这个信号。
    评估解码的结果，如果接收者正确解码，则交互成功。
学习和进化：
    基于交互的成功与否，更新代理的编码器和解码器。
    代理可能会死亡、繁殖或发生变异，这取决于它们的交互成功率。
评估和迭代：
    跟踪每一代的成功交互率。
    可能会引入新的代理或去除性能较差的代理。

In [None]:
# Adjust the learning function to provide the required inputs to the decoder
def learn_from_interaction(sender, receiver, sentence, optimizer):
    optimizer.zero_grad()
    
    # Sender encodes the sentence
    encoded_signal = sender.encode(sentence)
    
    # Prepare a partial input for the decoder (starting with <SOS> tokens)
    partial_input = torch.full((sentence.shape[0], MAX_LENGTH), word2index["<PAD>"], dtype=torch.long)
    partial_input[:, 0] = word2index["<SOS>"]
    
    # Receiver tries to decode the signal
    decoded_sentence_logits = receiver.model.decoder(partial_input, encoded_signal)
    
    # Calculate loss and update the receiver model
    loss = F.cross_entropy(decoded_sentence_logits.permute(0, 2, 1), sentence)
    loss.backward()
    optimizer.step()

def train_agents_through_interactions(agents, data, epochs=1):
    optimizer = torch.optim.Adam([
        {'params': agent.model.parameters()} for agent in agents
    ])
    
    for epoch in range(epochs):
        # For each agent as a sender
        for sender in agents:
            # Encode a random sentence into a signal
            sentence = random.choice(data).unsqueeze(0)
            
            # All other agents try to decode the signal
            for receiver in agents:
                if receiver != sender:  # Ensure it's not the same agent
                    learn_from_interaction(sender, receiver, sentence, optimizer)

# Train agents through interactions again
train_agents_through_interactions(agents, real_data, epochs=1)

# Evaluate the success rate after training
success_rate_after_training = interaction_process(agents, real_data)
success_rate_after_training


In [None]:
def evolve_agents(agents, data, top_k=2, mutation_rate=0.01):
    # 1. Evaluate: calculate success rate for each agent
    success_rates = []
    for agent in agents:
        success_count = 0
        for _ in range(10):  # Evaluate each agent over 10 interactions
            success_count += interaction_process([agent] + agents, data)
        success_rates.append(success_count / 10)
    
    # 2. Select: Get top performing agents based on success rates
    top_agents = [agents[i] for i in sorted(range(len(success_rates)), key=lambda i: success_rates[i], reverse=True)[:top_k]]
    
    # 3. Reproduce: Clone top agents to replace the low performing ones
    for i in range(len(agents) - top_k):
        # Copy one of the top agents
        new_agent = copy.deepcopy(random.choice(top_agents))
        
        # 4. Mutate: apply small random noise to the agent's model parameters
        for param in new_agent.model.parameters():
            if random.random() < mutation_rate:
                noise = torch.randn_like(param) * mutation_rate
                param.data += noise
        
        agents[top_k + i] = new_agent
    
    return agents

# Evolve the agents based on their interactions
evolved_agents = evolve_agents(agents, real_data)

# Check the first evolved agent
evolved_agents[0]


In [None]:
def evolve_agents_lightweight(agents, data, top_k=2, mutation_rate=0.01):
    # 1. Evaluate: calculate success rate for each agent
    success_rates = []
    for agent in agents:
        success_count = 0
        for _ in range(10):  # Evaluate each agent over 10 interactions
            success_count += interaction_process([agent] + agents, data)
        success_rates.append(success_count / 10)
    
    # 2. Select: Get top performing agents based on success rates
    top_agents = [agents[i] for i in sorted(range(len(success_rates)), key=lambda i: success_rates[i], reverse=True)[:top_k]]
    
    # 3. Reproduce and Mutate: Clone the weights of top agents and apply random noise
    for i in range(len(agents) - top_k):
        # Copy weights from one of the top agents
        chosen_agent = random.choice(top_agents)
        new_agent = Agent(chosen_agent.model.__class__())  # Create a new instance of the same model
        new_agent.model.load_state_dict(chosen_agent.model.state_dict())  # Load the weights
        
        # 4. Mutate: apply small random noise to the agent's model parameters
        for param in new_agent.model.parameters():
            if random.random() < mutation_rate:
                noise = torch.randn_like(param) * mutation_rate
                param.data += noise
        
        agents[top_k + i] = new_agent
    
    return agents

# Evolve the agents again using the lightweight method
evolved_agents_lightweight = evolve_agents_lightweight(agents, real_data)

# Check the first evolved agent
evolved_agents_lightweight[0]
