In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pickle
import os
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F
# Select the compute device once: CUDA when it is available, otherwise the CPU.


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class Config:
    """Static run settings, exposed as class attributes."""

    # Architecture selector read by the __main__ section
    # ("linear", "seq2seq" or "txtcnn").
    model_name = 'txtcnn'
    # Number of training epochs passed to train_model.
    epoch = 50
    # Not referenced anywhere else in the visible code.
    embedding_size = 128

    def __init__(self):
        """Create a config object; it carries no per-instance state."""




# Define custom dataset class for PyTorch
class RewardDataset(Dataset):
    """Character-level dataset yielding a prompt and four candidate responses.

    Every row of ``dataframe`` must provide the columns ``prompt``,
    ``true_response``, ``first_false_response``, ``second_false_response``
    and ``adversarial_false_response``.  Each field is mapped character by
    character through ``stoi`` and then truncated or right-padded to exactly
    ``max_len`` token ids.

    Args:
        dataframe: pandas DataFrame holding the five text columns.
        stoi: mapping from a single character to its integer token id.
        vocab_size: stored on the instance; the dataset itself does not use it.
        max_len: fixed length of every returned sequence.  Defaults to 64,
            the value that used to be hard-coded.

    Raises:
        ValueError: if ``max_len`` is not a positive integer.
    """

    # The order of the columns is the order of the tensors in each item.
    COLUMNS = (
        "prompt",
        "true_response",
        "first_false_response",
        "second_false_response",
        "adversarial_false_response",
    )
    # Padding id. Note that unknown characters are mapped to the same id (0).
    PAD_ID = 0

    def __init__(self, dataframe, stoi, vocab_size, max_len=64):
        if max_len <= 0:
            raise ValueError(f"max_len must be positive, got {max_len}")
        self.data = dataframe
        self.stoi = stoi
        self.vocab_size = vocab_size
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return five ``(max_len,)`` int64 tensors, ordered as ``COLUMNS``."""
        # Fetch the row once instead of doing one positional lookup per column.
        row = self.data.iloc[idx]
        return tuple(
            torch.tensor(self._encode(row[column]), dtype=torch.long)
            for column in self.COLUMNS
        )

    def _encode(self, text):
        """Tokenize one cell and force the result to exactly ``max_len`` ids."""
        if not isinstance(text, str):
            # pandas reads empty CSV cells as NaN (a float) and numeric-looking
            # cells as numbers; neither can be iterated character by character.
            text = "" if pd.isna(text) else str(text)
        tokens = self.tokenizer(text)[:self.max_len]
        return tokens + [self.PAD_ID] * (self.max_len - len(tokens))

    def tokenizer(self, text):
        """Tokenize input text using stoi (string-to-index).

        Characters missing from ``stoi`` are mapped to id 0.
        """
        return [self.stoi.get(c, 0) for c in text]

# Define the neural network model for regression
class RewardRegressionModel(nn.Module):
    """Feed-forward scorer over flattened prompt and response embeddings.

    Both inputs are embedded with a shared embedding table, flattened,
    concatenated and passed through two linear layers.  The final sigmoid
    keeps every score in the range [0, 1].

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: width of the hidden linear layer.
        output_dim: number of values produced per example.
        seq_len: number of tokens in the prompt and in the response.  The
            first linear layer is sized from it, so inputs must have exactly
            this length.  Defaults to 64, the value that used to be
            hard-coded.
    """

    def __init__(self, vocab_size=86, embedding_dim=768, hidden_dim=512, output_dim=1, seq_len=64):
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Factor 2: the flattened prompt and response are concatenated.
        self.fc1 = nn.Linear(seq_len * embedding_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, prompt, response):
        """Score each (prompt, response) pair.

        Args:
            prompt: integer token ids of shape ``(batch, seq_len)``.
            response: integer token ids of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with values in [0, 1].
        """
        # Flatten each embedded sequence to (batch, seq_len * embedding_dim).
        prompt_emb = self.embedding(prompt).flatten(start_dim=1)
        response_emb = self.embedding(response).flatten(start_dim=1)

        combined = torch.cat((prompt_emb, response_emb), dim=1)

        hidden = self.relu(self.fc1(combined))
        return self.sigmoid(self.fc2(hidden))


class Seq2SeqRewardModel(nn.Module):
    """LSTM encoder/decoder scorer for a (prompt, response) pair.

    The prompt is run through an encoder LSTM; its final hidden state
    initialises a decoder LSTM that reads the response.  The decoder output
    at the last timestep is projected to one value and squashed by a sigmoid.
    """

    def __init__(self, vocab_size=86, embedding_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Single output unit per example.
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, prompt, response):
        """Return one sigmoid score per example, shape ``(batch, 1)``."""
        # Only the encoder's final hidden state is kept.
        _, (final_hidden, _) = self.encoder(self.embedding(prompt))

        # The same tensor seeds both the hidden state and the cell state of
        # the decoder; the encoder's own cell state is discarded.
        initial_state = (final_hidden, final_hidden)
        step_outputs, _ = self.decoder(self.embedding(response), initial_state)

        # Score from the decoder output at the last timestep.
        last_step = step_outputs[:, -1, :]
        return self.sigmoid(self.fc(last_step))

class GlobalMaxPool1d(nn.Module):
    """Max-pool over the whole last dimension, leaving a length of 1."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Pool ``(batch_size, channel, seq_len)`` to ``(batch_size, channel, 1)``."""
        # One pooling window that spans the full sequence.
        full_length = x.size(2)
        return F.max_pool1d(x, kernel_size=full_length)

class TextCNN(nn.Module):
    """Convolutional scorer over the concatenated prompt and response.

    Prompt and response are embedded with a shared table and concatenated
    along the sequence dimension.  Several 1-D convolutions with different
    kernel sizes are applied, each followed by ReLU and max-over-time
    pooling; the pooled features are concatenated, passed through dropout
    and projected to a single sigmoid score.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (the conv input channels).
        kernel_sizes: kernel size of each convolution.
        num_channels: output channels of each convolution, paired by position
            with ``kernel_sizes``.

    Raises:
        ValueError: if ``kernel_sizes`` and ``num_channels`` differ in length.
            ``zip`` would otherwise drop the extra entries silently and the
            linear layer would no longer match the pooled feature size.
    """

    def __init__(self, vocab_size=86, embedding_dim=768, kernel_sizes=(3, 5, 7, 9, 11), num_channels=(512, 512, 512, 512, 512)):
        super().__init__()
        # Tuple defaults avoid sharing one mutable list between instances.
        kernel_sizes = tuple(kernel_sizes)
        num_channels = tuple(num_channels)
        if len(kernel_sizes) != len(num_channels):
            raise ValueError(
                "kernel_sizes and num_channels must have the same length, got "
                f"{len(kernel_sizes)} and {len(num_channels)}"
            )

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 1)
        # The max-over-time pooling layer has no weights, so a single
        # instance is shared by all convolutions.
        self.pool = GlobalMaxPool1d()
        # One 1-D convolution per (channels, kernel size) pair.
        self.convs = nn.ModuleList()
        self.sigmoid = nn.Sigmoid()

        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=embedding_dim,
                                        out_channels=c,
                                        kernel_size=k))

    def forward(self, prompt, response):
        """Return one sigmoid score per example, shape ``(batch, 1)``.

        The combined length of prompt and response must be at least the
        largest kernel size, otherwise the convolution fails.
        """
        prompt_emb = self.word_embeddings(prompt)
        response_emb = self.word_embeddings(response)
        # Concatenate along the sequence dimension, then move the embedding
        # dimension to the channel position expected by Conv1d.
        embeds = torch.cat((prompt_emb, response_emb), dim=1)
        embeds = embeds.permute(0, 2, 1)
        # Each convolution yields (batch, channels, 1) after max-over-time
        # pooling; drop the last dimension and concatenate over channels.
        encoding = torch.cat(
            [self.pool(F.relu(conv(embeds))).squeeze(-1) for conv in self.convs],
            dim=1,
        )
        # Apply dropout, then the linear layer, to obtain the output.
        outputs = self.decoder(self.dropout(encoding))
        return self.sigmoid(outputs)




# Training loop
def _score_batch(model, criterion, batch):
    """Score one batch and return its averaged pairwise loss.

    ``batch`` holds five tensors: prompt, true response, first false,
    second false and adversarial false response.  The model is called four
    times (true, first false, second false, adversarial; this order is kept
    so random layers such as dropout consume random numbers as before).

    Returns:
        ``(loss, true_outputs, false_outputs)`` where ``false_outputs`` is a
        tuple of the three false-response score tensors.
    """
    prompts, true_response, ff_response, sf_response, af_response = (
        tensor.to(device) for tensor in batch
    )

    true_outputs = model(prompts, true_response)
    ff_outputs = model(prompts, ff_response)
    sf_outputs = model(prompts, sf_response)
    af_outputs = model(prompts, af_response)
    false_outputs = (ff_outputs, sf_outputs, af_outputs)

    # One loss term per (true, false) pair: the score difference is pushed
    # towards the target 1.
    pair_losses = []
    for outputs in false_outputs:
        margin = true_outputs - outputs
        pair_losses.append(criterion(margin, torch.ones_like(margin)))

    loss = (pair_losses[0] + pair_losses[1] + pair_losses[2]) / 3
    return loss, true_outputs, false_outputs


def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=10, checkpoint_path=None):
    """Train ``model`` with a pairwise loss and checkpoint the best epoch.

    Each epoch trains on ``train_loader``, then evaluates on
    ``valid_loader`` and prints the losses and the accuracy.  A prediction
    counts as correct when a true response scores above 0.5 or a false
    response scores below 0.5, so every example contributes four
    predictions.  The state dict is saved whenever the validation loss AND
    the accuracy both improve on the best saved epoch.

    Tensors are moved to the module-level ``device``.

    Args:
        model: module called as ``model(prompt, response)``.
        train_loader: iterable of 5-tensor batches; must support ``len``.
        valid_loader: iterable of 5-tensor batches; must support ``len``.
        criterion: loss called as ``criterion(score_difference, ones)``.
        optimizer: optimizer over the model parameters.
        num_epochs: number of epochs to run.
        checkpoint_path: where the best state dict is written.  When omitted,
            the module-level ``save_path`` is used, as before.

    Raises:
        ValueError: if no checkpoint path is given and no module-level
            ``save_path`` exists.
    """
    # NOTE(review): the caller in this file passes BCEWithLogitsLoss while the
    # models already end in a sigmoid, so the score difference lies in
    # (-1, 1) and the loss cannot fall below log(1 + e^-1) ~= 0.313.
    # Confirm whether that is intended.
    if checkpoint_path is None:
        # Backward compatibility: the path used to be read from a global that
        # only exists when the file runs as a script or notebook cell.
        checkpoint_path = globals().get("save_path")
        if checkpoint_path is None:
            raise ValueError(
                "No checkpoint path: pass checkpoint_path or define a "
                "module-level save_path"
            )

    best_valid_loss = float('inf')
    best_accuracy = 0.0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0

        progress_bar = tqdm(train_loader, desc=f"Epoch [{epoch + 1}/{num_epochs}]", leave=False)
        for batch in progress_bar:
            optimizer.zero_grad()
            loss, _, _ = _score_batch(model, criterion, batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # max(..., 1) avoids a division by zero on an empty loader.
        avg_train_loss = running_loss / max(len(train_loader), 1)

        # Validation phase
        model.eval()
        valid_loss = 0.0
        count_right = 0
        count_total = 0
        with torch.no_grad():
            for batch in valid_loader:
                loss, true_outputs, false_outputs = _score_batch(model, criterion, batch)
                valid_loss += loss.item()

                # Only the first output column is thresholded.
                count_right += int((true_outputs[:, 0] > 0.5).sum().item())
                for outputs in false_outputs:
                    count_right += int((outputs[:, 0] < 0.5).sum().item())
                # Four predictions per example: one true, three false.
                count_total += 4 * true_outputs.size(0)

        # Divide by the number of predictions actually made rather than a
        # constant tied to one particular validation-set size.
        accuracy = count_right / count_total if count_total else 0.0
        avg_valid_loss = valid_loss / max(len(valid_loader), 1)

        print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_valid_loss:.4f}")
        print(f"Accuracy: {accuracy}")

        # NOTE(review): both metrics must improve at once, so checkpoints can
        # be rare when the two move in opposite directions.
        if avg_valid_loss < best_valid_loss and accuracy > best_accuracy:
            best_valid_loss = avg_valid_loss
            best_accuracy = accuracy
            # Save the best model's parameters
            torch.save(model.state_dict(), checkpoint_path)
            print(f"New best model saved with validation loss: {best_valid_loss:.4f} with acc {accuracy:.4f}")

# Run training

if __name__ == "__main__":
    # Script entry point: load the vocabulary, build the data loaders and the
    # model selected by Config.model_name, optionally resume from an existing
    # checkpoint, then train.
    config = Config()
    meta_path = 'meta.pkl'

    save_path = config.model_name + ".pth"

    # Fail with a clear message instead of a NameError on ``meta`` below.
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"Vocabulary metadata file not found: {meta_path}")
    # NOTE: unpickling can execute arbitrary code; only load a meta.pkl from a
    # trusted source.
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    vocab_size = meta['vocab_size']
    stoi = meta['stoi']  # string-to-index mapping
    itos = meta['itos']  # index-to-string mapping

    reward_train_dir = 'Training_Set.csv'
    reward_valid_dir = 'Validation_Set.csv'

    train_set = pd.read_csv(reward_train_dir)
    valid_set = pd.read_csv(reward_valid_dir)

    # Create PyTorch Datasets and DataLoaders
    train_dataset = RewardDataset(train_set, stoi, vocab_size=vocab_size)
    valid_dataset = RewardDataset(valid_set, stoi, vocab_size=vocab_size)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

    # Instantiate the model
    if config.model_name == "linear":
        model = RewardRegressionModel(vocab_size=vocab_size).to(device)
    elif config.model_name == "seq2seq":
        model = Seq2SeqRewardModel(vocab_size=vocab_size).to(device)
    elif config.model_name == "txtcnn":
        model = TextCNN(vocab_size=vocab_size).to(device)
    else:
        raise ValueError("Invalid model name")

    # Resume from an existing checkpoint when one is present.
    if os.path.exists(save_path):
        # map_location lets a checkpoint saved on GPU load on a CPU-only
        # machine; weights_only restricts unpickling to tensor data, which is
        # all a state dict contains.
        state_dict = torch.load(save_path, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)
        print(f"Model loaded from {save_path}")

    # Loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=config.epoch)

  model.load_state_dict(torch.load(save_path))


Model loaded from txtcnn.pth




Epoch [1/50], Train Loss: 0.3685, Valid Loss: 0.5486
Accuracy: 0.5834
New best model saved with validation loss: 0.5486 with acc 0.5834




Epoch [2/50], Train Loss: 0.3689, Valid Loss: 0.5693
Accuracy: 0.5965




Epoch [3/50], Train Loss: 0.3664, Valid Loss: 0.5663
Accuracy: 0.5936




Epoch [4/50], Train Loss: 0.3668, Valid Loss: 0.5630
Accuracy: 0.5924




Epoch [5/50], Train Loss: 0.3667, Valid Loss: 0.5548
Accuracy: 0.5829




Epoch [6/50], Train Loss: 0.3645, Valid Loss: 0.5407
Accuracy: 0.5691




Epoch [7/50], Train Loss: 0.3669, Valid Loss: 0.5671
Accuracy: 0.5916




Epoch [8/50], Train Loss: 0.3631, Valid Loss: 0.5624
Accuracy: 0.59




Epoch [9/50], Train Loss: 0.3656, Valid Loss: 0.5542
Accuracy: 0.5844




Epoch [10/50], Train Loss: 0.3648, Valid Loss: 0.5667
Accuracy: 0.5938




Epoch [11/50], Train Loss: 0.3628, Valid Loss: 0.5501
Accuracy: 0.5822




Epoch [12/50], Train Loss: 0.3648, Valid Loss: 0.5595
Accuracy: 0.589




Epoch [13/50], Train Loss: 0.3639, Valid Loss: 0.5541
Accuracy: 0.5839




Epoch [14/50], Train Loss: 0.3616, Valid Loss: 0.5565
Accuracy: 0.5925




Epoch [15/50], Train Loss: 0.3624, Valid Loss: 0.5724
Accuracy: 0.5951




Epoch [16/50], Train Loss: 0.3632, Valid Loss: 0.5475
Accuracy: 0.5826




Epoch [17/50], Train Loss: 0.3615, Valid Loss: 0.5486
Accuracy: 0.5806




Epoch [18/50], Train Loss: 0.3614, Valid Loss: 0.5556
Accuracy: 0.5879




Epoch [19/50], Train Loss: 0.3625, Valid Loss: 0.5752
Accuracy: 0.5982




Epoch [20/50], Train Loss: 0.3592, Valid Loss: 0.5710
Accuracy: 0.5974




Epoch [21/50], Train Loss: 0.3606, Valid Loss: 0.5531
Accuracy: 0.5773




Epoch [22/50], Train Loss: 0.3622, Valid Loss: 0.5606
Accuracy: 0.588




Epoch [23/50], Train Loss: 0.3596, Valid Loss: 0.5760
Accuracy: 0.5963




Epoch [24/50], Train Loss: 0.3620, Valid Loss: 0.5650
Accuracy: 0.5874




Epoch [25/50], Train Loss: 0.3600, Valid Loss: 0.5464
Accuracy: 0.5775




Epoch [26/50], Train Loss: 0.3586, Valid Loss: 0.5469
Accuracy: 0.5825




Epoch [27/50], Train Loss: 0.3592, Valid Loss: 0.5335
Accuracy: 0.5663




Epoch [28/50], Train Loss: 0.3591, Valid Loss: 0.5433
Accuracy: 0.5716




Epoch [29/50], Train Loss: 0.3591, Valid Loss: 0.5468
Accuracy: 0.5761




Epoch [30/50], Train Loss: 0.3591, Valid Loss: 0.5418
Accuracy: 0.5793




Epoch [31/50], Train Loss: 0.3578, Valid Loss: 0.5893
Accuracy: 0.6042




Epoch [32/50], Train Loss: 0.3580, Valid Loss: 0.5489
Accuracy: 0.5814




Epoch [33/50], Train Loss: 0.3596, Valid Loss: 0.5625
Accuracy: 0.591




Epoch [34/50], Train Loss: 0.3574, Valid Loss: 0.5701
Accuracy: 0.5956




Epoch [35/50], Train Loss: 0.3566, Valid Loss: 0.5612
Accuracy: 0.5892




Epoch [36/50], Train Loss: 0.3552, Valid Loss: 0.5638
Accuracy: 0.5943




Epoch [37/50], Train Loss: 0.3562, Valid Loss: 0.5413
Accuracy: 0.5764




Epoch [38/50], Train Loss: 0.3539, Valid Loss: 0.5575
Accuracy: 0.5911




Epoch [39/50], Train Loss: 0.3570, Valid Loss: 0.5636
Accuracy: 0.5945




Epoch [40/50], Train Loss: 0.3567, Valid Loss: 0.5661
Accuracy: 0.5891




Epoch [41/50], Train Loss: 0.3561, Valid Loss: 0.5820
Accuracy: 0.6012




Epoch [42/50], Train Loss: 0.3570, Valid Loss: 0.5647
Accuracy: 0.5867




Epoch [43/50], Train Loss: 0.3542, Valid Loss: 0.5599
Accuracy: 0.5868




Epoch [44/50], Train Loss: 0.3550, Valid Loss: 0.5402
Accuracy: 0.5704




Epoch [45/50], Train Loss: 0.3570, Valid Loss: 0.5771
Accuracy: 0.5949




Epoch [46/50], Train Loss: 0.3583, Valid Loss: 0.5585
Accuracy: 0.5766




Epoch [47/50], Train Loss: 0.3544, Valid Loss: 0.5556
Accuracy: 0.5778




Epoch [48/50], Train Loss: 0.3528, Valid Loss: 0.5632
Accuracy: 0.5845




Epoch [49/50], Train Loss: 0.3538, Valid Loss: 0.5617
Accuracy: 0.5817




Epoch [50/50], Train Loss: 0.3537, Valid Loss: 0.5562
Accuracy: 0.5805
