<a href="https://colab.research.google.com/github/Sayali19-cell/GAN-Dataset-for-gender-prediction/blob/main/Gender_Prediction_using_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch transformers keras



In [None]:
import torch
import torch.nn as nn

class TextGenerator(nn.Module):
    """Label-conditioned LSTM mapping token-id noise to per-token vocabulary scores.

    Every input token is embedded, the conditioning label is appended to each
    time step's embedding, the sequence is run through a batch-first LSTM, and
    each hidden state is projected to ``vocab_size`` unnormalised scores.

    Args:
        vocab_size: Number of embedding rows and width of the output projection.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        label_dim: Number of label features appended to every time step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim + label_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, noise, labels):
        """Generate per-token vocabulary scores conditioned on ``labels``.

        Args:
            noise: Integer token ids of shape ``(batch, seq_len)``.
            labels: Conditioning labels, either ``(batch, label_dim)`` or, when
                ``label_dim == 1``, a flat ``(batch,)`` tensor. Any numeric
                dtype is accepted; it is cast to the embedding dtype.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)`` with raw
            (unnormalised) scores.

        Raises:
            ValueError: If ``labels`` cannot be interpreted as one
                ``label_dim``-sized row per batch element.
        """
        input_embeds = self.embedding(noise)
        batch_size, seq_len = input_embeds.size(0), input_embeds.size(1)
        label_dim = self.lstm.input_size - self.embedding.embedding_dim

        # A flat (batch,) label tensor is treated as (batch, 1). Calling
        # repeat(1, seq_len, 1) on the 2-D result of unsqueeze(1) would
        # otherwise prepend a dimension and tile along the wrong axis.
        if labels.dim() == 1:
            labels = labels.unsqueeze(-1)
        if labels.dim() != 2 or labels.size(0) != batch_size or labels.size(1) != label_dim:
            raise ValueError(
                f"labels must have shape ({batch_size}, {label_dim}) or ({batch_size},) "
                f"when label_dim == 1, got {tuple(labels.shape)}"
            )

        # Integer labels must match the embedding dtype before concatenation.
        labels = labels.to(dtype=input_embeds.dtype)
        # expand() broadcasts the label over time without copying it.
        labels_per_step = labels.unsqueeze(1).expand(-1, seq_len, -1)

        input_with_labels = torch.cat((input_embeds, labels_per_step), dim=-1)
        lstm_out, _ = self.lstm(input_with_labels)
        return self.fc(lstm_out)


In [None]:
class TextDiscriminator(nn.Module):
    """LSTM classifier with two sigmoid heads: real/fake and binary gender.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state feeding both heads.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc_real_fake = nn.Linear(hidden_dim, 1)
        self.fc_gender = nn.Linear(hidden_dim, 1)  # For binary gender classification

    def forward(self, text):
        """Score a batch of sequences.

        Args:
            text: Either integer token ids of shape ``(batch, seq_len)``, or a
                floating-point tensor of shape ``(batch, seq_len, vocab_size)``
                holding unnormalised per-token scores over the vocabulary.

        Returns:
            Tuple ``(real_fake_output, gender_output)``, each of shape
            ``(batch, 1)`` with values in ``(0, 1)``.

        Raises:
            ValueError: If a floating-point input is not 3-D or its last
                dimension does not equal the embedding table's vocabulary size.
        """
        if torch.is_floating_point(text):
            # nn.Embedding only accepts integer indices. For score tensors,
            # embed each position as the softmax-weighted average of the
            # embedding rows, which stays differentiable w.r.t. the scores.
            vocab_size = self.embedding.num_embeddings
            if text.dim() != 3 or text.size(-1) != vocab_size:
                raise ValueError(
                    "floating-point input must have shape (batch, seq_len, "
                    f"{vocab_size}), got {tuple(text.shape)}"
                )
            token_probs = torch.softmax(text, dim=-1)
            embeds = token_probs @ self.embedding.weight
        else:
            embeds = self.embedding(text)

        lstm_out, _ = self.lstm(embeds)
        # Both heads read the LSTM output at the final time step.
        last_hidden = lstm_out[:, -1, :]
        real_fake_output = torch.sigmoid(self.fc_real_fake(last_hidden))
        gender_output = torch.sigmoid(self.fc_gender(last_hidden))
        return real_fake_output, gender_output

In [None]:
criterion = nn.BCELoss()


def train_gan(generator, discriminator, dataloader, num_epochs=50,
              optimizer_g=None, optimizer_d=None, vocab_size=None):
    """Alternate discriminator and generator updates for ``num_epochs`` epochs.

    For every batch the discriminator is first trained on real samples
    (validity target 1) and on detached generated samples (validity target 0),
    together with the gender targets; the generator is then trained so that
    the discriminator scores its samples as real and recovers the label they
    were conditioned on.

    Args:
        generator: Callable as ``generator(noise, labels)`` where ``noise`` is
            a ``(batch, seq_len)`` tensor of token ids and ``labels`` is a
            ``(batch, 1)`` float tensor of 0/1 values.
        discriminator: Callable as ``discriminator(x)`` returning
            ``(validity, gender)`` probabilities. It is called both on the real
            batches and directly on the generator's output, so it must accept
            both.
        dataloader: Iterable yielding ``(real_texts, real_labels)`` where
            ``real_texts`` is ``(batch, seq_len)``. It is iterated once per
            epoch.
        num_epochs: Number of passes over ``dataloader``.
        optimizer_g: Optimizer for the generator. Defaults to Adam with
            ``lr=2e-4, betas=(0.5, 0.999)`` over ``generator.parameters()``.
        optimizer_d: Optimizer for the discriminator, same default as above.
        vocab_size: Exclusive upper bound for the random noise token ids.
            Defaults to ``generator.embedding.num_embeddings``.

    Returns:
        None. The models are updated in place and one loss line is printed per
        epoch that processed at least one batch.
    """
    if optimizer_g is None:
        optimizer_g = torch.optim.Adam(generator.parameters(), lr=2e-4, betas=(0.5, 0.999))
    if optimizer_d is None:
        optimizer_d = torch.optim.Adam(discriminator.parameters(), lr=2e-4, betas=(0.5, 0.999))
    if vocab_size is None:
        vocab_size = generator.embedding.num_embeddings

    for epoch in range(num_epochs):
        g_loss = d_loss = None

        for real_texts, real_labels in dataloader:
            # Take sizes from the batch itself so a short final batch or a
            # different sequence length cannot desynchronise real and fake data.
            batch_size, seq_length = real_texts.size(0), real_texts.size(1)
            device = real_texts.device

            # Step 1: Train Discriminator
            optimizer_d.zero_grad()

            # Real samples. BCELoss needs float targets shaped like its input.
            real_validity, real_gender = discriminator(real_texts)
            real_targets = real_labels.to(real_gender.dtype).reshape(real_gender.shape)
            real_loss = (
                criterion(real_validity, torch.ones_like(real_validity))
                + criterion(real_gender, real_targets)
            )

            # Fake samples. Labels are (batch, 1) floats so they serve both as
            # generator conditioning and as BCE targets for the gender head.
            noise = torch.randint(0, vocab_size, (batch_size, seq_length), device=device)
            fake_labels = torch.randint(0, 2, (batch_size, 1), device=device).float()
            generated_texts = generator(noise, fake_labels)
            # detach(): the discriminator step must not backpropagate into the generator.
            fake_validity, fake_gender = discriminator(generated_texts.detach())
            fake_loss = (
                criterion(fake_validity, torch.zeros_like(fake_validity))
                + criterion(fake_gender, fake_labels)
            )

            d_loss = (real_loss + fake_loss) / 2
            d_loss.backward()
            optimizer_d.step()

            # Step 2: Train Generator
            optimizer_g.zero_grad()
            fake_validity, fake_gender = discriminator(generated_texts)
            g_loss = (
                criterion(fake_validity, torch.ones_like(fake_validity))
                + criterion(fake_gender, fake_labels)
            )
            g_loss.backward()
            optimizer_g.step()

        # An empty dataloader leaves no losses to report.
        if g_loss is not None:
            print(f"Epoch {epoch}/{num_epochs} - Generator Loss: {g_loss.item()}, Discriminator Loss: {d_loss.item()}")

In [None]:
# --- Step 1: hyperparameters -------------------------------------------------
vocab_size = 30000   # number of token ids (adjust to the vocabulary in use)
embedding_dim = 128  # width of each token embedding
hidden_dim = 256     # width of the LSTM hidden state
label_dim = 1        # a single feature carrying the binary (0/1) gender label
seq_length = 50      # number of tokens in each generated sequence

# --- Step 2: generator -------------------------------------------------------
# Arguments are passed positionally in the constructor's declared order.
generator = TextGenerator(
    vocab_size,
    embedding_dim,
    hidden_dim,
    label_dim,
)

# Step 3: Define the data generation function
def generate_synthetic_data(generator, num_samples, num_tokens=None, sequence_length=None):
    """Draw ``num_samples`` outputs from ``generator``, one sample per call.

    Each call feeds the generator a ``(1, sequence_length)`` tensor of random
    token ids and a random 0/1 label of shape ``(1,)``.

    Args:
        generator: Callable as ``generator(noise, label)``.
        num_samples: Number of samples to generate; ``0`` yields empty lists.
        num_tokens: Exclusive upper bound for the random token ids. Defaults
            to the module-level ``vocab_size``.
        sequence_length: Number of token ids per noise sequence. Defaults to
            the module-level ``seq_length``.

    Returns:
        Tuple ``(synthetic_texts, synthetic_labels)``: two lists of length
        ``num_samples`` holding the generator's raw outputs and the label
        tensors they were conditioned on.
    """
    if num_tokens is None:
        num_tokens = vocab_size
    if sequence_length is None:
        sequence_length = seq_length

    synthetic_texts = []
    synthetic_labels = []

    # Inference only: without no_grad() every stored output would keep its
    # autograd graph alive for as long as the list exists.
    with torch.no_grad():
        for _ in range(num_samples):
            noise = torch.randint(0, num_tokens, (1, sequence_length))  # Random noise as input
            gender_label = torch.randint(0, 2, (1,))  # Random binary gender label
            generated_text = generator(noise, gender_label)
            synthetic_texts.append(generated_text)
            synthetic_labels.append(gender_label)

    return synthetic_texts, synthetic_labels

# Step 4: Draw 1000 synthetic samples from the generator
synthetic_texts, synthetic_labels = generate_synthetic_data(
    generator=generator,
    num_samples=1000,
)

# Optional sanity check: show the first generated sample and its label
print(
    f"Sample Generated Text: {synthetic_texts[0]}",
    f"Sample Generated Label: {synthetic_labels[0]}",
    sep="\n",
)


Sample Generated Text: tensor([[[-0.0142,  0.0172,  0.0940,  ..., -0.0406,  0.0274, -0.0593],
         [ 0.0047, -0.0125, -0.0675,  ..., -0.1168,  0.0125, -0.1397],
         [ 0.0919, -0.0300, -0.1197,  ..., -0.1022, -0.1191, -0.0996],
         ...,
         [-0.0261,  0.0226, -0.0685,  ...,  0.0218,  0.0150, -0.0674],
         [ 0.0134,  0.0607, -0.0385,  ..., -0.0683,  0.1098, -0.0821],
         [ 0.0494,  0.0483,  0.0617,  ..., -0.0943,  0.0496, -0.1385]]],
       grad_fn=<ViewBackward0>)
Sample Generated Label: tensor([1])


In [None]:
import pandas as pd

# Create a DataFrame for the synthetic dataset.
#
# Each CSV cell must be a value that can be parsed back. Stringifying a large
# NumPy array abbreviates it with "...", and stringifying a tensor label gives
# "tensor([1])", so samples are written as space-separated token ids and
# labels as plain integers.
def _to_token_string(sample):
    """Return ``sample`` as a space-separated string of token ids.

    Floating-point samples are treated as per-token scores over the vocabulary
    and reduced to the highest-scoring id at each position; integer samples
    are taken as token ids already.
    """
    sample = sample.detach().cpu()
    if sample.is_floating_point():
        sample = sample.argmax(dim=-1)
    return " ".join(str(token_id) for token_id in sample.reshape(-1).tolist())


synthetic_df = pd.DataFrame({
    'text': [_to_token_string(text) for text in synthetic_texts],
    'gender': [int(label) for label in synthetic_labels],  # one-element tensor -> int
})

# Save it as a CSV file for the next phase of fine-tuning
synthetic_df.to_csv('synthetic_gender_texts.csv', index=False)