In [1]:
pip install transformers torch pandas numpy scikit-learn




In [2]:
import pandas as pd

# Read the full content dataset from disk
df = pd.read_csv('content_data.csv')

# Reproducible 80/20 split: sample the training rows with a fixed seed,
# then keep every row whose index label was not sampled as validation data
train_df = df.sample(frac=0.8, random_state=42)
val_df = df[~df.index.isin(train_df.index)]


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a padding token, so any tokenizer call that asks for
# padding raises a ValueError. Reuse the end-of-sequence token as the pad token
# and record the same id on the model config so both sides agree.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Batch tokenization helper
def tokenize_function(examples):
    """Tokenize the 'text' entry of `examples` with the module-level tokenizer.

    Sequences are truncated and padded with the 'max_length' strategy.
    Not called in the cells visible here; presumably meant for a
    map-style preprocessing step - TODO confirm.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, padding='max_length')

# Encode each split in a single tokenizer call (truncation on, padding on),
# in the order: training split first, validation split second
train_encodings, val_encodings = (
    tokenizer(split_df['text'].tolist(), truncation=True, padding=True)
    for split_df in (train_df, val_df)
)

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    `encodings` is a mapping from field name to an indexable collection of
    per-example values; `labels` is an indexable collection of the same length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the label collection
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` into a tensor ...
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # ... and attach the matching label under the 'labels' key
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# GPT2LMHeadModel is a causal language model: its `labels` must be token ids
# with the same shape as `input_ids`, not one scalar per example. The
# engagement scores therefore cannot be used as labels for this model.
def _causal_lm_labels(encodings):
    """Return per-token labels: the input ids, with padded positions set to -100.

    -100 is the label value the language-modeling loss ignores, so padding
    does not contribute to training.
    """
    return [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(encodings['input_ids'], encodings['attention_mask'])
    ]

train_dataset = CustomDataset(train_encodings, _causal_lm_labels(train_encodings))
val_dataset = CustomDataset(val_encodings, _causal_lm_labels(val_encodings))

# Fine-tuning configuration: output locations first, then the schedule
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
)

# Bind the model, the configuration and both dataset splits together
trainer = Trainer(
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run the fine-tuning loop
trainer.train()


In [None]:
def generate_content(prompt, max_length=100):
    """Generate a continuation of `prompt` with the module-level model.

    Args:
        prompt: Text the model should continue.
        max_length: Upper bound passed to `model.generate` (default 100,
            the value that was previously hard-coded).

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Move the encoded prompt to the model's device: after training the model
    # may live on an accelerator while freshly tokenized tensors are on CPU.
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly so generation does not have to guess it
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        # GPT-2 has no dedicated pad token; pad with end-of-sequence instead
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test: generate text for one sample prompt and print the result.
prompt = "Write a blog post about the benefits of AI in healthcare:"
generated_text = generate_content(prompt)
print(generated_text)


In [None]:
from sklearn.metrics import mean_squared_error

# Assuming you have engagement metrics for A/B testing
test_prompts = ["AI in healthcare", "AI in education"]
test_engagements = []

for prompt in test_prompts:
    generated_content = generate_content(prompt)
    # Simulate engagement by comparing to existing data: keep the training rows
    # whose text contains the generated content verbatim.
    # regex=False -> plain substring test; na=False -> rows with missing or
    # non-string text count as "no match" instead of raising a TypeError.
    # NOTE(review): verbatim containment will rarely match freshly generated
    # text, so most prompts fall back to 0 - confirm this is the intended proxy.
    contains_generated = train_df['text'].str.contains(
        generated_content, regex=False, na=False
    )
    closest_match = train_df.loc[contains_generated]
    if closest_match.empty:
        engagement = 0  # Assume low engagement for new content
    else:
        engagement = closest_match['engagement'].iloc[0]
    test_engagements.append(engagement)

# Evaluate the results of A/B testing: first prompt with the highest engagement
best_prompt_idx = test_engagements.index(max(test_engagements))
best_prompt = test_prompts[best_prompt_idx]
print(f"Best performing prompt: {best_prompt} with engagement {test_engagements[best_prompt_idx]}")
